In [5]:
from __future__ import (absolute_import, division,
print_function, unicode_literals)
import warnings
warnings.simplefilter('ignore')
# general purpose packages
import pandas as pd
import numpy as np
import os
import json
import time
import re
import csv
import subprocess
import sys
import scipy.stats as stats
import statsmodels.stats as smstats
from statsmodels.stats.multitest import multipletests
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import sklearn
import umap
from multiprocessing import Process, Manager, Pool
import multiprocessing
from functools import partial
from collections import Counter
import seaborn as sns; sns.set()
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
matplotlib.rcParams['backend'] = "Qt5Agg"
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
from IPython.display import display, Image
from adjustText import adjust_text
import builtins
%matplotlib inline
# for working with sam/bam files
import HTSeq
# for working with yaml files
import ruamel.yaml
import itertools
In [27]:
# paths to subdirectories
subdirs = {}
subdirs['main_project_dir'] = '/scicore/home/zavolan/GROUP/Primer_Probe_design/'
subdirs['human_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/'
subdirs['swine_annotation_dir'] = '/scicore/home/zavolan/GROUP/Genomes/swine/'
subdirs['shared_project_dir'] = subdirs['main_project_dir']
subdirs['PRRSV_dir'] = subdirs['shared_project_dir']+'PRRSV/'
subdirs['PRRSV_github_dir'] = subdirs['PRRSV_dir']+'github/'
subdirs['PRRSV_materials_dir'] = subdirs['PRRSV_dir']+'materials/'
subdirs['PRRSV_reference_genomes_dir'] = subdirs['PRRSV_dir']+'reference_genomes/'
# technicals
subdirs['temp_dir'] = subdirs['shared_project_dir']+'temp/'
subdirs['figures_dir'] = subdirs['shared_project_dir']+'figures/'
subdirs['tables_dir'] = subdirs['shared_project_dir']+'tables/'
# paths to files
file_paths = {}
### genome annotation files
file_paths['human_genome_file'] = subdirs['human_annotation_dir']+'GRCh38.primary_assembly.genome.fa'
file_paths['human_genome_fai_file'] = subdirs['human_annotation_dir']+'GRCh38.primary_assembly.genome.fa.fai'
file_paths['human_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/gencode.v42.annotation.gtf'
file_paths['human_RNAcentral_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/homo_sapiens.GRCh38.gff3.gz'
file_paths['human_enriched_annotation_file'] = subdirs['human_annotation_dir']+'hg38_v42/enriched.gencode.v42.annotation.gtf'
file_paths['human_chrom_sizes'] = subdirs['human_annotation_dir']+'hg38.chrom.sizes'
os.system('mkdir -p '+' '.join(list(subdirs.values()))) # create all subdirs
Out[27]:
0
PRRSV - track hub¶
Initial github configuration¶
In [37]:
# cd to subdirs['PRRSV_github_dir']
# git init
# then create github repo on the github website
# then create test file
# then, for every change to put to the repo online:
# git add -A
# git commit -m "message"
# git push -u origin main
reference genomes for PRRSV-type 1 (Lelystad) and PRRSV-type 2 (VR-2332)¶
In [32]:
command = 'esearch -db nucleotide -query "'+'M96262'+'" | efetch -format fasta > '+subdirs['PRRSV_reference_genomes_dir']+'M96262.Lelystad.PRRSV_type1.fasta'+'; '
command = command+'esearch -db nucleotide -query "'+'U87392'+'" | efetch -format fasta > '+subdirs['PRRSV_reference_genomes_dir']+'U87392.VR2332.PRRSV_type2.fasta'+'; '
command = command+'cat '+subdirs['PRRSV_reference_genomes_dir']+'M96262.Lelystad.PRRSV_type1.fasta '+subdirs['PRRSV_reference_genomes_dir']+'U87392.VR2332.PRRSV_type2.fasta > '+subdirs['PRRSV_reference_genomes_dir']+'PRRSV_reference_genomes.fasta'
command
Out[32]:
'esearch -db nucleotide -query "M96262" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/M96262.Lelystad.PRRSV_type1.fasta; esearch -db nucleotide -query "U87392" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/U87392.VR2332.PRRSV_type2.fasta; cat /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/M96262.Lelystad.PRRSV_type1.fasta /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/U87392.VR2332.PRRSV_type2.fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/reference_genomes/PRRSV_reference_genomes.fasta'
In [38]:
subdirs['PRRSV_github_dir']
Out[38]:
'/scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/github/'
In [ ]:
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [ ]:
myHub/ - directory to organize your files on this hub
hub.txt – primary reference text file to define the hub, refers to:
genomes.txt – definitions for each genome assembly on this hub
newOrg1/ - directory of files for this specific genome assembly
newOrg1.2bit – ‘2bit’ file constructed from your fasta sequence
description.html – information about this assembly for users
trackDb.txt – definitions for tracks on this genome assembly
groups.txt – definitions for track groups on this assembly
bigWig and bigBed files – data for tracks on this assembly
external track hub data tracks can be displayed on this assembly
In [ ]:
# define hub.txt
"""hub PRRSV_Hub
shortLabel PRRSV
longLabel PRRSV
useOneFile on
email magmir71@gmail.com
descriptionUrl https://raw.githubusercontent.com/zavolanlab/primer_probe_design/main/track_descriptions/Track_hub_description.html
genome hg38
defaultPos chr3:108042402-108047903
organism Homo Sapiens
In [ ]:
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")
command = ''
bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 20
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html
"""
f.write(bigbed_composite)
In [33]:
subdirs['PRRSV_github_dir']
Out[33]:
'/scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/github/'
In [ ]:
In [ ]:
In [ ]:
download canadian WGS of PPRSV-type2 from GENBANK¶
In [19]:
paths_tsv = pd.read_csv(subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/WGS_Canada.tsv',delimiter="\t",index_col=None,header=0)
paths_tsv['sample'] = paths_tsv['GenBank']+'.'+paths_tsv['sample_type']+'.'+paths_tsv['number_of_pooled_animals'].str.replace('.','')
paths_tsv['target_path'] = subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/'+paths_tsv['sample']+'.fasta'
In [26]:
# try downloading everything to single fasta file
target_path = subdirs['PRRSV_materials_dir']+'WGS_PMID32817228/whole_genome_seqs.PRRSV_2.fasta'
command = ''
for index,row in paths_tsv.iterrows():
command = command+'esearch -db nucleotide -query "'+row['GenBank']+'" | efetch -format fasta '+('>' if index==0 else '>>')+' '+target_path+'; '
command
Out[26]:
'esearch -db nucleotide -query "MN865482" | efetch -format fasta > /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865566" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865567" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865483" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865484" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865485" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865486" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865487" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865488" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865568" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865569" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865489" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865490" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865491" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865492" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865493" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865494" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865495" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865496" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865570" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865571" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865497" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865498" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865499" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865500" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865501" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865502" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865503" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865504" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865505" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865506" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865507" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865508" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865509" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865510" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865511" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865512" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865513" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865514" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865515" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865516" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865517" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865518" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865519" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865520" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865521" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865522" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865523" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865524" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865525" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865526" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865527" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865528" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865529" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865530" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865531" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865532" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865533" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865534" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865535" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865536" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865537" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865538" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865539" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865540" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865541" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865542" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865543" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865544" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865545" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865546" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865547" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865548" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865549" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865550" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865551" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865552" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865553" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865554" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865555" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865556" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865557" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865558" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865559" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865560" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865561" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865562" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865572" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865573" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865563" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865564" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; esearch -db nucleotide -query "MN865565" | efetch -format fasta >> /scicore/home/zavolan/GROUP/Primer_Probe_design/PRRSV/materials/WGS_PMID32817228/whole_genome_seqs.fasta; '
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
human¶
In [366]:
organism = 'human'
num_of_samples = 813
organism_label = organism if organism!='celegans' else 'worm'
df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'
SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)
ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [367]:
tissues = ['prostate', 'skin', 'penis', 'intestine', 'heart', 'kidney', 'breast',
'lung', 'uterus', 'nose', 'pancreas', 'trachea', 'bone', 'eye', 'liver','ureter', 'brain','bloodImmune']
In [368]:
###
# make bed file
###
mp_cols = [col for col in SCINPAS.columns if 'MP' in col]
v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)
results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')
mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
In [369]:
mp_to_report_df
Out[369]:
| source | input_thr | num_pas | motif_percentage | short_MP | |
|---|---|---|---|---|---|
| 0 | v3 | full | 18432135 | 20.056922 | 20 |
| 1 | v3 | 20_MP | 16605700 | 21.215884 | 21 |
| 2 | v3 | 25_MP | 13068497 | 23.315596 | 23 |
| 3 | v3 | 30_MP | 8775588 | 26.246629 | 26 |
| 4 | v3 | 35_MP | 5478736 | 29.275676 | 29 |
| 5 | v3 | 40_MP | 3348203 | 32.333494 | 32 |
| 6 | v3 | 45_MP | 2004409 | 35.653801 | 36 |
| 7 | v3 | 50_MP | 1296263 | 38.872513 | 39 |
| 8 | v3 | 60_MP | 604072 | 46.730688 | 47 |
| 9 | v3 | 65_MP | 426465 | 51.415239 | 51 |
| 10 | v3 | 70_MP | 312120 | 56.573433 | 57 |
| 11 | v3 | 75_MP | 236099 | 61.822371 | 62 |
| 12 | v3 | 80_MP | 173253 | 68.589866 | 69 |
| 13 | v3 | 85_MP | 121869 | 77.469250 | 77 |
| 14 | v3 | 90_MP | 88284 | 86.715600 | 87 |
| 15 | v3 | 95_MP | 64777 | 97.511462 | 98 |
In [370]:
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency
SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)
# SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['%_of_tissues_with_support'] = np.round(100*np.count_nonzero(SCINPAS[tissues],1)/len(tissues),1) # calculate percentage of tissues instead of samples
SCINPAS['num_of_protocols'] = 1
SCINPAS['avg_expression'] = np.round(SCINPAS[tissues].mean(1),6) # mean of means
tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)
if 'PAS_cat' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('PAS_cat',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')
motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']
def fill_motif(x):
res = ';'.join([elem for elem in motif_cols if x[elem]==1])
if res=='':
res = 'NaN'
return res
tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)
if 'motif' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('motif',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)
tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
tmp['chr'] = tmp['seqid'].str.replace('chr','').str.replace('X','23').str.replace('Y','24').astype('int')
if 'chr' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('chr',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')
SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)
SCINPAS[['seqid','start','end','alt_id','avg_expression',
'strand','%_of_tissues_with_support',
'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
sep=str('\t'),header=False,index=None,
quoting=csv.QUOTE_NONE,compression='gzip')
In [371]:
Counter(SCINPAS['PAS_cat'])
Out[371]:
Counter({'DI': 3920458,
'UI': 2454745,
'TE': 564914,
'IN': 10507671,
'EX': 348266,
'AL': 635988,
'NA': 93})
In [373]:
###
# correct tsv file
###
input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]
SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
'%_of_tissues_with_support':'perc_tissues','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})
SCINPAS_tsv = SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
'perc_tissues', 'nr_prots', 'annotation',
'gene_name', 'gene_id', 'repSite_signals','stringency_level']+tissues]
# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'
SCINPAS_tsv = SCINPAS_tsv.rename(columns=tissue_rename_dict)
SCINPAS_tsv.to_csv(file_paths[organism+'_output_PAS_tsv_gz'],sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE,compression='gzip')
In [374]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[374]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/atlas.clusters.3.0.GRCh38.GENCODE_42.bed.gz',
'/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/atlas.clusters.3.0.GRCh38.GENCODE_42.tsv.gz')
mouse¶
In [177]:
organism = 'mouse'
num_of_samples = 188
organism_label = organism if organism!='celegans' else 'worm'
df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'
SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)
ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [178]:
tissues = ['Tongue', 'Bladder', 'Kidney', 'unknown', 'Spleen', 'Fat', 'Marrow',
'Lung', 'Aorta', 'Heart', 'MammaryGland', 'LimbMuscle', 'Liver', 'Skin',
'Pancreas', 'Thymus', 'LargeIntestine', 'Trachea']
In [179]:
###
# make bed file
###
mp_cols = [col for col in SCINPAS.columns if 'MP' in col]
v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)
results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')
In [180]:
results_df
Out[180]:
| source | input_thr | num_pas | motif_percentage | short_MP | |
|---|---|---|---|---|---|
| 0 | v3 | full | 1750661 | 31.755434 | 32 |
| 1 | v3 | 10_MP | 1750231 | 31.762379 | 32 |
| 2 | v3 | 15_MP | 1749436 | 31.774240 | 32 |
| 3 | v3 | 20_MP | 1747582 | 31.798565 | 32 |
| 4 | v3 | 25_MP | 1742464 | 31.857588 | 32 |
| 5 | v3 | 30_MP | 1720494 | 32.077066 | 32 |
| 6 | v3 | 35_MP | 1659463 | 32.592953 | 33 |
| 7 | v3 | 40_MP | 1509646 | 33.636826 | 34 |
| 8 | v3 | 45_MP | 1159635 | 35.511950 | 36 |
| 9 | v3 | 50_MP | 812441 | 38.166464 | 38 |
| 10 | v3 | 60_MP | 374502 | 46.313504 | 46 |
| 11 | v3 | 65_MP | 255694 | 51.276135 | 51 |
| 12 | v3 | 70_MP | 178719 | 56.678921 | 57 |
| 13 | v3 | 75_MP | 126435 | 62.408352 | 62 |
| 14 | v3 | 80_MP | 88566 | 68.975679 | 69 |
| 15 | v3 | 85_MP | 61528 | 76.241711 | 76 |
| 16 | v3 | 90_MP | 43620 | 84.094452 | 84 |
| 17 | v3 | 95_MP | 30230 | 93.691697 | 94 |
In [181]:
mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency
SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)
# SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['%_of_tissues_with_support'] = np.round(100*np.count_nonzero(SCINPAS[tissues],1)/len(tissues),1) # calculate percentage of tissues instead of samples
SCINPAS['num_of_protocols'] = 1
SCINPAS['avg_expression'] = np.round(SCINPAS[tissues].mean(1),6) # mean of means
tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)
if 'PAS_cat' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('PAS_cat',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')
motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']
def fill_motif(x):
res = ';'.join([elem for elem in motif_cols if x[elem]==1])
if res=='':
res = 'NaN'
return res
tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)
if 'motif' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('motif',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)
tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
tmp['chr'] = tmp['seqid'].str.replace('chr','').str.replace('X','20').str.replace('Y','21').astype('int')
if 'chr' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('chr',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')
SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)
SCINPAS[['seqid','start','end','alt_id','avg_expression',
'strand','%_of_tissues_with_support',
'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
sep=str('\t'),header=False,index=None,
quoting=csv.QUOTE_NONE,compression='gzip')
In [182]:
len(SCINPAS)
Out[182]:
1750661
In [183]:
Counter(SCINPAS['PAS_cat'])
Out[183]:
Counter({'DI': 451686,
'UI': 287219,
'TE': 157405,
'IN': 693688,
'EX': 79934,
'AL': 80615,
'NA': 114})
In [184]:
###
# correct tsv file
###
input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]
SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
'%_of_tissues_with_support':'perc_tissues','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})
SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
'perc_tissues', 'nr_prots', 'annotation',
'gene_name', 'gene_id', 'repSite_signals','stringency_level']+tissues].to_csv(file_paths[organism+'_output_PAS_tsv_gz'],
sep=str('\t'),header=True,index=None,
quoting=csv.QUOTE_NONE,compression='gzip')
In [185]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[185]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/atlas.clusters.3.0.GRCm38.GENCODE_M25.bed.gz',
'/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/atlas.clusters.3.0.GRCm38.GENCODE_M25.tsv.gz')
worm¶
In [186]:
organism = 'celegans'
num_of_samples = 55
organism_label = organism if organism!='celegans' else 'worm'
df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'
SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)
ourdir = os.path.dirname(file_paths[organism+'_output_PAS_bed_gz'])
command = 'mkdir -p '+ourdir
out = subprocess.check_output(command, shell=True)
In [187]:
###
# make bed file
###
mp_cols = [col for col in SCINPAS.columns if 'MP' in col]
v3_point = get_points_others(SCINPAS, 'v3')
results = get_points(SCINPAS, mp_cols)
results.append(v3_point)
results_df = pd.DataFrame(results, columns=['source', 'input_thr', 'num_pas', 'motif_percentage'])
results_df['num_pas'] = results_df['num_pas'].astype('int')
results_df['motif_percentage'] = results_df['motif_percentage'].astype('float')
results_df = results_df.sort_values('motif_percentage',ascending=True).reset_index(drop=True)
results_df['short_MP'] = np.round(results_df['motif_percentage'],0).astype('int')
mp_to_report_df = results_df.drop_duplicates('short_MP').reset_index(drop=True)
In [ ]:
In [188]:
mp_to_report_df
Out[188]:
| source | input_thr | num_pas | motif_percentage | short_MP | |
|---|---|---|---|---|---|
| 0 | v3 | full | 66458 | 51.137561 | 51 |
| 1 | v3 | 25_MP | 65738 | 51.592686 | 52 |
| 2 | v3 | 40_MP | 62236 | 53.264991 | 53 |
| 3 | v3 | 45_MP | 58803 | 54.799925 | 55 |
| 4 | v3 | 50_MP | 54481 | 56.830822 | 57 |
| 5 | v3 | 60_MP | 44154 | 62.583231 | 63 |
| 6 | v3 | 65_MP | 39067 | 66.035273 | 66 |
| 7 | v3 | 70_MP | 34024 | 69.850694 | 70 |
| 8 | v3 | 75_MP | 28943 | 74.083544 | 74 |
| 9 | v3 | 80_MP | 22572 | 79.651781 | 80 |
| 10 | v3 | 85_MP | 17502 | 85.327391 | 85 |
| 11 | v3 | 90_MP | 13779 | 90.420205 | 90 |
| 12 | v3 | 95_MP | 7625 | 95.619672 | 96 |
In [189]:
SCINPAS['full'] = 1
SCINPAS['alt_score'] = SCINPAS[list(mp_to_report_df['input_thr'])].mul(list(mp_to_report_df['short_MP'])).max(1) # stringency
SCINPAS['alt_id'] = SCINPAS.apply(lambda x:':'.join((x['id'].split(':')[:3])),1)
SCINPAS['%_of_samples_with_support'] = np.round(SCINPAS['supp']/num_of_samples*100,3)
SCINPAS['num_of_protocols'] = 1
SCINPAS['avg_expression'] = np.round(SCINPAS['score'],6) # just average, since we don't have tissues
tmp = SCINPAS[['segment_class']].drop_duplicates().reset_index(drop=True)
PAS_cat_dict = {'TE':'TE','I':'IN','E':'EX','A':'AL','D_I':'DI','U_I':'UI','N':'NA'}
tmp['PAS_cat'] = tmp['segment_class'].map(PAS_cat_dict)
if 'PAS_cat' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('PAS_cat',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='segment_class')
motif_cols = ['AAUAAA', 'AUUAAA','UAUAAA', 'AAGAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA','GAUAAA', 'ACUAAA', 'AAUAGA']
def fill_motif(x):
res = ';'.join([elem for elem in motif_cols if x[elem]==1])
if res=='':
res = 'NaN'
return res
tmp = SCINPAS[motif_cols].drop_duplicates().reset_index(drop=True)
tmp['motif'] = tmp.apply(lambda x: fill_motif(x), 1)
if 'motif' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('motif',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on=motif_cols)
tmp = SCINPAS[['seqid']].drop_duplicates().reset_index(drop=True)
map_dict = {'I':1,'II':2,'III':3,'IV':'4','V':5,'X':6}
tmp['chr'] = tmp['seqid'].map(map_dict)
tmp['new_seqid'] = 'chr'+tmp['seqid'] # to comply with UCSC format
if 'chr' in list(SCINPAS.columns):
SCINPAS = SCINPAS.drop('chr',1)
SCINPAS = pd.merge(SCINPAS,tmp,how='left',on='seqid')
SCINPAS = SCINPAS.sort_values(['chr','start','end','strand']).reset_index(drop=True)
SCINPAS[['new_seqid','start','end','alt_id','avg_expression',
'strand','%_of_samples_with_support',
'num_of_protocols','alt_score','PAS_cat','motif']].to_csv(file_paths[organism+'_output_PAS_bed_gz'],
sep=str('\t'),header=False,index=None,
quoting=csv.QUOTE_NONE,compression='gzip')
In [190]:
len(SCINPAS)
Out[190]:
66458
In [191]:
Counter(SCINPAS['PAS_cat'])
Out[191]:
Counter({'DI': 15113,
'TE': 26294,
'IN': 9342,
'EX': 11353,
'UI': 3393,
'AL': 963})
In [194]:
###
# correct tsv file
###
input_gtf = pd.read_csv(file_paths[organism+'_annotation_file'], delimiter = '\t', header = None, skiprows=5)
input_gtf[[0,1,2,5,6,7]] = input_gtf[[0,1,2,5,6,7]].astype('category') # to decrease size of the DF
input_gtf[[3,4]] = input_gtf[[3,4]].astype('int')
genes = input_gtf.loc[input_gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('";',expand=True)[0]
genes['gene_name'] = genes[8].str.split('gene_name "',expand=True)[1].str.split('";',expand=True)[0]
SCINPAS_tsv = SCINPAS.copy()
SCINPAS_tsv = pd.merge(SCINPAS_tsv,genes[['gene_id','gene_name']].drop_duplicates().reset_index(drop=True).rename(columns = {'gene_id':'reassigned_g'}),how='left',on='reassigned_g')
SCINPAS_tsv['rep'] = SCINPAS_tsv.apply(lambda x:int(x['alt_id'].split(':')[1]),1)
SCINPAS_tsv = SCINPAS_tsv.rename(columns={'new_seqid':'chrom','start':'chromStart','end':'chromEnd','alt_id':'name','score':'score_original','alt_score':'stringency_level','strand':'strand',
'%_of_samples_with_support':'perc_samples','num_of_protocols':'nr_prots','PAS_cat':'annotation','gene_id':'gene_id_original','reassigned_g':'gene_id','motif':'repSite_signals'})
SCINPAS_tsv[['chrom','chromStart','chromEnd','name','avg_expression','strand','rep',
'perc_samples', 'nr_prots', 'annotation',
'gene_name', 'gene_id', 'repSite_signals','stringency_level']].to_csv(file_paths[organism+'_output_PAS_tsv_gz'],
sep=str('\t'),header=True,index=None,
quoting=csv.QUOTE_NONE,compression='gzip')
In [193]:
file_paths[organism+'_output_PAS_bed_gz'],file_paths[organism+'_output_PAS_tsv_gz']
Out[193]:
('/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/atlas.clusters.3.0.WBcel235.WormBase_WS293.bed.gz',
'/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/atlas.clusters.3.0.WBcel235.WormBase_WS293.tsv.gz')
In [ ]:
In [ ]:
In [ ]:
Human¶
In [375]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'
organism = 'human'
organism_github_subfolder = 'GRCh38'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']
github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'
out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)
SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [377]:
len(SCINPAS_tsv)
Out[377]:
18432135
In [378]:
# renamed tissues!
tissues = ['brain', 'bloodImmune','eye', 'intervertebral disc', 'nasal mucosa', 'skin', 'corpus cavernosum', 'heart', 'breast',
'lung', 'tracheal epithelium', 'pancreas','intestine','kidney parenchyma', 'liver', 'uterus', 'ureter','prostate']
In [24]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values
strands = {'+':'plus','-':'minus'}
command = ''
stringency_groups = {'20-21':[20, 21],'22-29':[23, 26, 29],'30-100':[32, 36, 39, 47, 51, 57, 62, 69, 77, 87, 98]}
for stringency_group in stringency_groups:
l = stringency_groups[stringency_group]
for strand in strands:
SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['stringency_level'].isin(l))&(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
### bigbed - average
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
### bigwig - average
strand_long_label = strands[strand]
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command
Out[24]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.20-21.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.20-21.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.20-21.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.22-29.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.22-29.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.22-29.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.30-100.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.30-100.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.brain.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.brain.30-100.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/total/PAS.prostate.30-100.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.prostate.30-100.minus.sorted.bedgraph && '
In [382]:
### bigwig - tissue-specific
strands = {'+':'plus','-':'minus'}
test_out_dir = github_dir+'over_tissues/'
out = subprocess.check_output('mkdir -p '+test_out_dir, shell=True)
command = ''
for strand in strands:
strand_long_label = strands[strand]
# SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
for tissue in tissues:
outfile = test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw'
if not((os.path.isfile(outfile) and (os.stat(outfile).st_size > 0))):
# SCINPAS_tsv_cur[['chrom','chromStart','chromEnd',tissue]].to_csv(test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph" > "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" && '
command = command+'bedGraphToBigWig "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" '+chrom_sizes_file+' "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw" && '
command = command+'rm "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph" "'+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph" && '
command
Out[382]:
'sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.plus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.intervertebral disc.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.nasal mucosa.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.corpus cavernosum.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.tracheal epithelium.minus.sorted.bedgraph" && sort -k1,1 -k2,2n "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bedgraph" > "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" && bedGraphToBigWig "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" /scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38.chrom.sizes "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bw" && rm "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.bedgraph" "/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCh38/over_tissues/PAS.kidney parenchyma.minus.sorted.bedgraph" && '
In [388]:
# create text chunks for trackhub configuration
max_average = str(int(SCINPAS_tsv['avg_expression'].max()))
max_tissue_level = str(int(SCINPAS_tsv[tissues].max(1).max()))
stringency_groups = {'20-21':[20, 21],'22-29':[23, 26, 29],'30-100':[32, 36, 39, 47, 51, 57, 62, 69, 77, 87, 98]}
strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")
command = ''
bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 20
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html
"""
f.write(bigbed_composite)
i = 0
for stringency_group in stringency_groups:
for strand in strands:
bigbed = """track PAS_catalogue_"""+str(stringency_group)+'_'+strand+"""
parent PAS_catalogue on
shortLabel """+str(stringency_group)+"""% motif presence, """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
"""
f.write(bigbed)
i=i+1
multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM across tissues
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig)
for stringency_group in stringency_groups:
for strand in strands:
strand_long_label = strands[strand]
multiwig_subtrack = 'track PAS_'+str(stringency_group)+'_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+str(stringency_group)+'%, '+strand+"""
longLabel PAS average RPM, for PAS with """+str(stringency_group)+"""% motif presence, on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig_subtrack)
i=3
for strand in strands:
strand_long_label = strands[strand]
multiwig = """track PAS_tissue_RPM_"""+strand_long_label+"""
visibility dense
shortLabel PAS tissue RPM ("""+strand+""") strand
longLabel """+str(i)+""". PAS tissue RPM ("""+strand+""") strand
container multiWig
aggregate none
showSubtrackColorOnUi on
type bigWig 0 """+max_tissue_level+"""
viewLimits 0:"""+max_tissue_level+"""
autoScale on
maxHeighPixels 120:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html
"""
f.write(multiwig)
i=i+1
for tissue in tissues:
subtrack = """track PAS_"""+tissue.replace(' ','_')+"""_RPM_"""+strand_long_label+"""_strand
shortLabel """+tissue+"""
longLabel PAS """+tissue+""" RPM ("""+strand+""") strand
parent PAS_tissue_RPM_"""+strand_long_label+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_tissue_level+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/over_tissues/"""+'PAS.'+tissue+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html
"""
f.write(subtrack)
f.close()
Mouse¶
In [43]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'
organism = 'mouse'
organism_github_subfolder = 'GRCm38'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']
github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'
out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)
SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [44]:
len(SCINPAS_tsv)
Out[44]:
1750661
In [46]:
tissues = ['Bladder', 'Liver', 'Kidney','Spleen', 'Pancreas','LargeIntestine','Lung', 'Trachea',
'Fat','MammaryGland', 'Marrow', 'Aorta', 'Heart','Tongue',
'LimbMuscle', 'Skin', 'Thymus',
'unknown']
In [47]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values
strands = {'+':'plus','-':'minus'}
command = ''
stringency_group = 'all'
for strand in strands:
SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
### bigbed - average
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
### bigwig - average
strand_long_label = strands[strand]
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command
Out[47]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/total/PAS.all.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && '
In [49]:
### bigwig - tissue-specific
strands = {'+':'plus','-':'minus'}
test_out_dir = github_dir+'over_tissues/'
out = subprocess.check_output('mkdir -p '+test_out_dir, shell=True)
command = ''
for strand in strands:
strand_long_label = strands[strand]
SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
for tissue in tissues:
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd',tissue]].to_csv(test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph > '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph && '
command = command+'bedGraphToBigWig '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bw && '
command = command+'rm '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.bedgraph '+test_out_dir+'PAS.'+tissue+'.'+strand_long_label+'.sorted.bedgraph && '
command
Out[49]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Bladder.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Liver.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Kidney.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Spleen.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Pancreas.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LargeIntestine.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Lung.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Trachea.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Fat.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.MammaryGland.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Marrow.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Aorta.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Heart.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Tongue.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.LimbMuscle.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Skin.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.Thymus.minus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/mus_musculus/mm10.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/GRCm38/over_tissues/PAS.unknown.minus.sorted.bedgraph && '
In [51]:
# create text chunks for trackhub configuration
max_average = str(int(SCINPAS_tsv['avg_expression'].max()))
max_tissue_level = str(int(SCINPAS_tsv[tissues].max(1).max()))
strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")
command = ''
stringency_group = 'all'
bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 30
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html
"""
f.write(bigbed_composite)
i = 0
for strand in strands:
bigbed = """track PAS_catalogue_"""+strand+"""
parent PAS_catalogue on
shortLabel """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
"""
f.write(bigbed)
i=i+1
multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM across tissues
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig)
for strand in strands:
strand_long_label = strands[strand]
multiwig_subtrack = 'track PAS_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+strand+"""
longLabel PAS average RPM, for PAS on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig_subtrack)
i=3
for strand in strands:
strand_long_label = strands[strand]
multiwig = """track PAS_tissue_RPM_"""+strand_long_label+"""
visibility dense
shortLabel PAS tissue RPM ("""+strand+""") strand
longLabel """+str(i)+""". PAS tissue RPM ("""+strand+""") strand
container multiWig
aggregate none
showSubtrackColorOnUi on
type bigWig 0 """+max_tissue_level+"""
viewLimits 0:"""+max_tissue_level+"""
autoScale on
maxHeighPixels 120:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html
"""
f.write(multiwig)
i=i+1
for tissue in tissues:
subtrack = """track PAS_"""+tissue+"""_RPM_"""+strand_long_label+"""_strand
shortLabel """+tissue+"""
longLabel PAS """+tissue+""" RPM ("""+strand+""") strand
parent PAS_tissue_RPM_"""+strand_long_label+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_tissue_level+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/over_tissues/"""+'PAS.'+tissue+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_tissue_RPM_"""+strand_long_label+""".html
"""
f.write(subtrack)
f.close()
Worm¶
In [55]:
cur_dir = '/scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/'
organism = 'celegans'
organism_github_subfolder = 'WBcel235'
github_branch = 'dev'
chrom_sizes_file = file_paths[organism+'_chrom_sizes']
github_dir = subdirs[organism+'_github_dir']
out_dir = github_dir+'total/'
tmp_dir = cur_dir+'tmp/'
trackhub_chunks_dir = tmp_dir+organism+'/'
big_data_dir = subdirs['shared_project_dir']+(organism if organism!='celegans' else 'c_elegans')+'/polyAsite_Atlas_3/for_trackhub/'
out = subprocess.check_output('mkdir -p '+tmp_dir +' '+out_dir+' '+trackhub_chunks_dir+' '+big_data_dir, shell=True)
SCINPAS_tsv = pd.read_csv(file_paths[organism+'_output_PAS_tsv_gz'],delimiter="\t",index_col=None,header=0,compression='gzip')
In [4]:
# SCINPAS_tsv = SCINPAS_tsv.loc[SCINPAS_tsv['chrom']=='chr3'].reset_index(drop=True) # temporary filtration, for testing
In [56]:
len(SCINPAS_tsv)
Out[56]:
66458
In [59]:
# make bigbeds for annotation and bigwigs with average RPM, and tissue-specific RPM values
strands = {'+':'plus','-':'minus'}
command = ''
stringency_group = 'all'
for strand in strands:
SCINPAS_tsv_cur = SCINPAS_tsv.loc[(SCINPAS_tsv['strand']==strand)].reset_index(drop=True)
### bigbed - average
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','name','stringency_level','strand']].to_csv(out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed'+' > '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
command = command+'bedToBigBed -type=bed6 '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bb && '
command = command+'rm '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.bed '+out_dir+'PAS.'+str(stringency_group)+'.'+strand+'.total.sorted.bed && '
### bigwig - average
strand_long_label = strands[strand]
SCINPAS_tsv_cur[['chrom','chromStart','chromEnd','avg_expression']].to_csv(tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = command+'sort -k1,1 -k2,2n '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph > '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command = command+'bedGraphToBigWig '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph '+chrom_sizes_file+' '+out_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bw && '
command = command+'rm '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.bedgraph '+tmp_dir+'PAS.'+str(stringency_group)+'.'+strand_long_label+'.sorted.bedgraph && '
command
Out[59]:
'sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.+.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.plus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.plus.sorted.bedgraph && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bed > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed && bedToBigBed -type=bed6 /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bb && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.bed /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.-.total.sorted.bed && sort -k1,1 -k2,2n /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph > /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && bedGraphToBigWig /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph /scicore/home/zavolan/GROUP/Genomes/c_elegans/ce11.chrom.sizes /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/github/polyAsite_Atlas_3/WBcel235/total/PAS.all.minus.bw && rm /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.bedgraph /scicore/home/zavolan/mirono0000/Projects/SCINPAS_catalog/tmp/PAS.all.minus.sorted.bedgraph && '
In [61]:
# create text chunks for trackhub configuration
max_average = str(int(SCINPAS_tsv['avg_expression'].max()))
strands = {'+':'plus','-':'minus'}
f = open(trackhub_chunks_dir+'hub_total.'+organism+'.txt', "w")
command = ''
stringency_group = 'all'
bigbed_composite = """track PAS_catalogue
compositeTrack on
allButtonPair on
visibility squish
shortLabel PAS catalogue
longLabel 1. PAS - full catalogue
type bigBed 6 .
spectrum on
scoreMax 100
scoreMin 30
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_catalogue.html
"""
f.write(bigbed_composite)
i = 0
for strand in strands:
bigbed = """track PAS_catalogue_"""+strand+"""
parent PAS_catalogue on
shortLabel """+strand+""" strand
longLabel """+('1. PAS - full catalogue' if i==0 else '')+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand+'.total.bb'+"""
"""
f.write(bigbed)
i=i+1
multiwig = """track PAS_average_RPM
visibility full
shortLabel PAS mean RPM
longLabel 2. PAS, average RPM
container multiWig
aggregate transparentOverlay
showSubtrackColorOnUi on
type bigWig 0 """+max_average+"""
viewLimits 0:"""+max_average+"""
autoScale on
maxHeightPixels 90:60:8
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig)
for strand in strands:
strand_long_label = strands[strand]
multiwig_subtrack = 'track PAS_'+strand_long_label+'_strand'+"""
shortLabel RPM, """+strand+"""
longLabel PAS average RPM, for PAS on a ("""+strand+""") strand
parent PAS_average_RPM"""+"""
color """+('4,177,216' if strand=='+' else '255,68,51')+"""
type bigWig 0 """+max_average+"""
bigDataUrl https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/total/PAS."""+str(stringency_group)+'.'+strand_long_label+""".bw
html https://raw.githubusercontent.com/zavolanlab/polyAsite_Atlas_3/"""+github_branch+"""/"""+organism_github_subfolder+"""/track_descriptions/PAS_average_RPM.html
"""
f.write(multiwig_subtrack)
f.close()
Supplementary figure about overlaps¶
In [285]:
pas = pd.read_csv(subdirs['temp_dir']+'v3.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
v2 = pd.read_csv(subdirs['temp_dir']+'v2.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
deep = pd.read_csv(subdirs['temp_dir']+'DL.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0)
mp_cols = [col for col in pas.columns if 'MP' in col]
--------------------------------------------------------------------------- FileNotFoundError Traceback (most recent call last) Cell In[285], line 3 1 pas = pd.read_csv(subdirs['temp_dir']+'v3.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0) 2 v2 = pd.read_csv(subdirs['temp_dir']+'v2.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0) ----> 3 deep = pd.read_csv(subdirs['temp_dir']+'DL.human.human.with_segment_class.tsv',delimiter="\t",index_col=None,header=0) File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs) 209 else: 210 kwargs[new_arg_name] = new_arg_value --> 211 return func(*args, **kwargs) File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs) 325 if len(args) > num_allow_args: 326 warnings.warn( 327 msg.format(arguments=_format_argument_list(allow_args)), 328 FutureWarning, 329 stacklevel=find_stack_level(), 330 ) --> 331 return func(*args, **kwargs) File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options) 935 kwds_defaults = _refine_defaults_read( 936 dialect, 937 delimiter, (...) 946 defaults={"delimiter": ","}, 947 ) 948 kwds.update(kwds_defaults) --> 950 return _read(filepath_or_buffer, kwds) File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds) 602 _validate_names(kwds.get("names", None)) 604 # Create the parser. --> 605 parser = TextFileReader(filepath_or_buffer, **kwds) 607 if chunksize or iterator: 608 return parser File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds) 1439 self.options["has_index_names"] = kwds["has_index_names"] 1441 self.handles: IOHandles | None = None -> 1442 self._engine = self._make_engine(f, self.engine) File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine) 1733 if "b" not in mode: 1734 mode += "b" -> 1735 self.handles = get_handle( 1736 f, 1737 mode, 1738 encoding=self.options.get("encoding", None), 1739 compression=self.options.get("compression", None), 1740 memory_map=self.options.get("memory_map", False), 1741 is_text=is_text, 1742 errors=self.options.get("encoding_errors", "strict"), 1743 storage_options=self.options.get("storage_options", None), 1744 ) 1745 assert self.handles is not None 1746 f = self.handles.handle File ~/miniconda3/envs/py310/lib/python3.10/site-packages/pandas/io/common.py:856, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options) 851 elif isinstance(handle, str): 852 # Check whether the filename is to be opened in binary mode. 853 # Binary mode does not support 'encoding' and 'newline'. 854 if ioargs.encoding and "b" not in ioargs.mode: 855 # Encoding --> 856 handle = open( 857 handle, 858 ioargs.mode, 859 encoding=ioargs.encoding, 860 errors=errors, 861 newline="", 862 ) 863 else: 864 # Binary mode 865 handle = open(handle, ioargs.mode) FileNotFoundError: [Errno 2] No such file or directory: '/scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/DL.human.human.with_segment_class.tsv'
Overlapping of v3 with v2¶
In [292]:
df1 = pas[['seqid','start','end','strand','segment_class']+mp_cols].copy()
df2 = v2[['seqid','start','end','strand','segment_class']].copy()
df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [328]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']
df_of_label = 'v2.0'
df_in_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
In [293]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[293]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [325]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']
df_intersected = pd.merge(df1[['id_short','segment_class']+mp_cols_to_analyze],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [329]:
len(df_intersected),len(df1)
Out[329]:
(18432135, 18432135)
In [330]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
i=0
for mp_level in mp_cols_to_analyze:
df_sel = df_intersected_cat.loc[df_intersected_cat[mp_level]==1].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_sel,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
i=i+1
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
ax.legend(title="Motif presence in "+df_in_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1) # Set legend with no title
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
In [265]:
###
# v3 out of v2
###
In [314]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
for mp_level in mp_cols_to_analyze:
df1_sel = df1.loc[df1[mp_level]==1].reset_index(drop=True)
df1_sel[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed > '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed'
out = subprocess.check_output(command, shell=True)
print(mp_level+' done')
In [332]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']
df_in_label = 'v2.0'
df_of_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
mp_level = '10_MP'
In [333]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
i=0
for mp_level in mp_cols_to_analyze:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
df_intersected.columns = ['id_B','id_short','dist']
df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
i=i+1
ax.legend(title="Motif presence in "+df_of_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1) # Set legend with no title
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
Overlapping of v3 with DL¶
In [335]:
df1 = pas[['seqid','start','end','strand','segment_class']+mp_cols].copy()
df2 = deep[['seqid','start','end','strand','segment_class']].copy()
df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [336]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']
df_of_label = 'DL'
df_in_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
In [337]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[337]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [338]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']
df_intersected = pd.merge(df1[['id_short','segment_class']+mp_cols_to_analyze],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [339]:
len(df_intersected),len(df1)
Out[339]:
(18432135, 18432135)
In [341]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
i=0
for mp_level in mp_cols_to_analyze:
df_sel = df_intersected_cat.loc[df_intersected_cat[mp_level]==1].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_sel,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
i=i+1
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
ax.legend(title="Motif presence in "+df_in_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1) # Set legend with no title
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
In [342]:
###
# v3 out of v2
###
In [343]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
for mp_level in mp_cols_to_analyze:
df1_sel = df1.loc[df1[mp_level]==1].reset_index(drop=True)
df1_sel[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df1_'+mp_level+'.non_sorted.bed > '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1_'+mp_level+'.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed'
out = subprocess.check_output(command, shell=True)
print(mp_level+' done')
10_MP done 75_MP done 90_MP done
In [344]:
mp_cols_to_analyze = ['10_MP','75_MP','90_MP']
palette = ['teal','darkviolet','red']
df_in_label = 'DL'
df_of_label = 'v3.0'
mp_labels_to_show = ['20%','62%','87%']
mp_level = '10_MP'
In [346]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
i=0
for mp_level in mp_cols_to_analyze:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1_'+mp_level+'.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
df_intersected.columns = ['id_B','id_short','dist']
df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=palette[i],label=(mp_labels_to_show[i] if j==len(big_cat_vals)-1 else None))
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
i=i+1
ax.legend(title="Motif presence in "+df_of_label+", %",bbox_to_anchor=(1.01, 0.5),loc=3,borderaxespad=0.0,ncol=1) # Set legend with no title
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
Overlapping of v2 with DL¶
In [347]:
df1 = v2[['seqid','start','end','strand','segment_class']].copy()
df2 = deep[['seqid','start','end','strand','segment_class']].copy()
df1['id_short'] = df1.index
df1['score_bed'] = 0
df1[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df1.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
df2['id_short'] = df2.index
df2['score_bed'] = 0
df2[['seqid','start','end','id_short','score_bed','strand']].to_csv(subdirs['temp_dir']+'df2.non_sorted.bed', sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df1.non_sorted.bed > '+subdirs['temp_dir']+'df1.sorted.bed'
out = subprocess.check_output(command, shell=True)
command = 'bedtools sort -i '+subdirs['temp_dir']+'df2.non_sorted.bed > '+subdirs['temp_dir']+'df2.sorted.bed'
out = subprocess.check_output(command, shell=True)
In [ ]:
###
# v2 out of v3
###
In [349]:
color = 'orange'
df_of_label = 'DL'
df_in_label = 'v2.0'
In [348]:
command = 'bedtools closest -d -s -a '+subdirs['temp_dir']+'df1.sorted.bed -b '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df1_df2.intersection.bed'
command
Out[348]:
'bedtools closest -d -s -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1_df2.intersection.bed'
In [350]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df1_df2.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in B with one in A.
df_intersected.columns = ['id_short','id_B','dist']
df_intersected = pd.merge(df1[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [351]:
len(df_intersected),len(df1)
Out[351]:
(568608, 568608)
In [355]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=color)
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
In [356]:
###
# v3 out of v2
###
In [358]:
command = 'bedtools closest -d -s -b '+subdirs['temp_dir']+'df1.sorted.bed -a '+subdirs['temp_dir']+'df2.sorted.bed'+' | cut -f4,9,13 > '+subdirs['temp_dir']+'df2_df1.intersection.bed'
command
Out[358]:
'bedtools closest -d -s -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df1.sorted.bed -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2.sorted.bed | cut -f4,9,13 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/temp/df2_df1.intersection.bed'
In [359]:
color = 'green'
df_in_label = 'DL'
df_of_label = 'v2.0'
In [360]:
df_intersected = pd.read_csv(subdirs['temp_dir']+'df2_df1.intersection.bed', delimiter = '\t', header = None)
df_intersected = df_intersected.drop_duplicates(0).reset_index(drop=True) # when there is an overlap of several pas in A with one in B.
df_intersected.columns = ['id_B','id_short','dist']
df_intersected = pd.merge(df2[['id_short','segment_class']],df_intersected[['id_short','dist']],how='left',on='id_short')
df_intersected['dist_log10'] = np.log10(df_intersected['dist']+1)
In [362]:
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
sns.set(font_scale = 1)
sns.set_style("white")
fig, axes = plt.subplots(1, len(big_cat_vals), sharey=True, sharex=True, figsize = (2*len(big_cat_vals), 3))
j=0
for big_cat_val in big_cat_vals:
df_intersected_cat = df_intersected.loc[df_intersected[big_cat]==big_cat_val].reset_index(drop=True)
ax = sns.ecdfplot(ax=axes[j],data = df_intersected_cat,x='dist_log10',color=color)
ax.set(xlabel = '',title=big_cat_labels[j],ylabel='')
if j==2:
ax.set(xlabel = 'distance to nearest PAS in '+df_of_label+', $log_{10}$ bp')
ax.tick_params(bottom=True,left=False)
if j==0:
ax.set(ylabel='proportion of PAS from '+df_in_label+'\n\nwith matched PAS from '+df_of_label)
ax.tick_params(bottom=True,left=True)
ax.set(xlim=(-0.1,6),ylim=(-0.01,1.01))
# set tick points
l = [0,1,2,3,4,5]
ax.set_xticks(l)
j=j+1
i=i+1
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'catalog_comparison/', shell=True)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'catalog_comparison/overlap_prop_'+df_of_label+'_in_'+df_in_label+'.pdf',bbox_inches='tight',dpi=600)
Figure 2¶
In [7]:
df = pd.read_csv("/scicore/home/zavolan/moon0000/tissue_specific_pas/75motif_presence_filtered_additional_tissue_specific_ward_30_numpas_3_10.bed",delimiter="\t",index_col=None,header=0)
tissues = ['prostate', 'skin', 'penis', 'intestine', 'heart', 'kidney', 'breast',
'lung', 'uterus', 'nose', 'pancreas', 'trachea', 'bone', 'eye', 'liver','ureter', 'brain','bloodImmune']
usage_cols = ['usage_'+elem for elem in tissues]
quantile_columns = ["q5", "q10", "q15", "q20", "q80", "q90", "q95", "q99"]
miv_val = df.loc[df['mean_denom']>0]['mean_denom'].min()
df['mean_denom_log2'] = np.log2(df['mean_denom']+miv_val)
###
# calculate number of PAS per gene
###
df['num_PASs'] = 1
gr = df.groupby(['reassigned_g']).agg({'num_PASs':np.sum}).reset_index()
df = pd.merge(df.drop('num_PASs',1),gr,how='left',on='reassigned_g')
In [8]:
###
# calculate number of tissues with no gene expression
###
calc_dict = {}
for elem in tissues:
calc_dict[elem] = np.sum
gr = df.groupby(['reassigned_g','num_PASs']).agg(calc_dict).reset_index()
def low_q(x):
return np.quantile(x,0.01)
calc_dict = {}
for elem in tissues:
calc_dict[elem] = low_q
gr1 = gr.groupby('num_PASs').agg(calc_dict).reset_index()
gr1['min_val'] = gr1[tissues].min(1)
gr = pd.merge(gr,gr1[['num_PASs','min_val']],how='left',on='num_PASs')
gr['num_no_expr'] = gr[tissues].le(list(gr['min_val']),axis=0).sum(1)
a = (~gr[tissues].le(list(gr['min_val']),axis=0))
a[~a] = np.nan
gr = pd.concat([gr[['reassigned_g','num_no_expr']],a],axis=1)
rename_dict = {}
for elem in tissues:
rename_dict[elem] = elem+'_expr'
gr = gr.rename(columns=rename_dict)
if 'num_no_expr' in df.columns:
df = df.drop('num_no_expr',1)
df = pd.merge(df,gr,how='left',on='reassigned_g')
expr_status_cols = [elem+'_expr' for elem in tissues]
df['std_adj'] = df[usage_cols].mul(df[expr_status_cols].values,axis=0).std(1)
In [9]:
df['num_no_expr'].quantile(0.9)
Out[9]:
5.0
In [10]:
# select num of PAS
num_PASs_selection = (3,20)
scaling_value = 'q95'
scaling_value_ubiq = 'q20'
# df_filtered = df[df['mean_denom'] > threshold]
df_filtered = df.loc[(df['mean_denom_log2']>-1)&(df['num_no_expr']<1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
df_filtered['avg_usage'] = df_filtered[usage_cols].mean(1) # mean of means
df_filtered['ts'] = (df_filtered['std']>df_filtered[scaling_value]).astype('int')
df_filtered['ubiq'] = ((df_filtered['avg_usage']>0.5)&(df_filtered['std']<df_filtered[scaling_value_ubiq])).astype('int')
df_filtered['low'] = ((df_filtered['avg_usage']<0.5)&(df_filtered['std']<df_filtered[scaling_value_ubiq])).astype('int')
df_filtered['cat'] = df_filtered['ts']*3+df_filtered['ubiq']*2+df_filtered['low']*1
# Set figure size as requested
sns.set(font_scale = 0.5)
sns.set_style("white")
fig, ax = plt.subplots(1, 1, sharey=False, sharex=False, figsize = (2.8, 1))
x_feature,y_feature = 'mean_denom_log2','std'
xlabel, ylabel = 'tissue-avg host gene expression, $log_2$ RPM','st.dev. of PAS usage\nover tissues'
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==0], color = 'royalblue', alpha = 0.03, s = 0.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==3], color = 'green', alpha = 0.06, s = 0.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==1], color = 'magenta', alpha = 0.06, s = 2.5)
ax = sns.scatterplot(x = x_feature, y = y_feature, data = df_filtered.loc[df_filtered['cat']==2], color = 'orange', alpha = 1, s = 2.5)
quantiles_to_show = ["q20", "q95"]
palette = ['brown','green']
scaling_value = 'q95'
tmp = df[quantiles_to_show+[x_feature]].drop_duplicates().reset_index(drop=True)
i = 0
for quantile_ in quantiles_to_show:
ax = sns.lineplot(x = x_feature, y = quantile_, data = tmp, label = quantile_, linewidth = 1, color = palette[i])
i=i+1
ax.set(xlim = (-1,10),ylim=(-0.03,0.5),xlabel=xlabel,ylabel=ylabel)
ax.tick_params(left=True, bottom=True)
ax.legend(title="quantile of st. dev.",bbox_to_anchor=(0.2, 1),loc=3,borderaxespad=0.0,ncol=2) # Set legend with no title
# out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
# fig.savefig(subdirs['figures_dir']+'over_tissues/2A.png',bbox_inches='tight',dpi=600)
# fig.savefig(subdirs['figures_dir']+'over_tissues/2A.pdf',bbox_inches='tight',dpi=600)
Out[10]:
<matplotlib.legend.Legend at 0x14e175289c90>
In [12]:
len(df_filtered['reassigned_g'].unique())
Out[12]:
11888
In [78]:
Counter(df_filtered['cat'])
Out[78]:
Counter({0: 82781, 3: 4465, 1: 20358, 2: 163})
In [218]:
sns.set(font_scale = 0.5)
sns.set_style("white")
fig, axes = plt.subplots(1, 1, sharey=False, sharex=True, figsize = (2.8, 1))
bins_non_ts = list(pd.Series(range(0,105,5))/100)
bins_ts = list(pd.Series(range(0,105,5))/100)
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==0],x='avg_usage',color = 'royalblue')
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==3],x='avg_usage',color = 'green', alpha = 1, )
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==2],x='avg_usage',color = 'orange', alpha = 1,)
ax = sns.ecdfplot(data = df_filtered.loc[df_filtered['cat']==1],x='avg_usage',color = 'magenta', alpha = 1,)
ax.set(ylabel='CDF',xlabel='average PAS usage over tissues',ylim=(-0.01,1.01))
ax.tick_params(left=True, bottom=True)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==0],y='std',x='avg_usage',color = 'royalblue', alpha = 0.03, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==3],y='std',x='avg_usage',color = 'green', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==2],y='std',x='avg_usage',color = 'orange', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==1],y='std',x='avg_usage',color = 'magenta', alpha = 0.2, s = 0.5)
# ax.tick_params(left=True, bottom=True)
# ax.set(xlabel='average PAS usage over tissues',ylabel='st.dev. of PAS usage\nover tissues')
# out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
# fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.png',bbox_inches='tight',dpi=600)
# fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.pdf',bbox_inches='tight',dpi=600)
In [221]:
sns.set(font_scale = 0.5)
sns.set_style("white")
fig, axes = plt.subplots(1, 1, sharey=False, sharex=True, figsize = (2.8, 1))
bins_non_ts = list(pd.Series(range(0,105,5))/100)
bins_ts = list(pd.Series(range(0,105,5))/100)
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==1],x='avg_usage',color = 'magenta', alpha = 1, stat='density',bins=bins_non_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==0],x='avg_usage',color = 'royalblue', alpha = 1, stat='density',bins=bins_non_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==3],x='avg_usage',color = 'green', alpha = 1, stat='density',bins=bins_ts,element='step')
ax = sns.histplot(data = df_filtered.loc[df_filtered['cat']==2],x='avg_usage',color = 'orange', alpha = 1, stat='density',bins=bins_non_ts,element='step')
ax.set(ylabel='Density',xlabel='average PAS usage over tissues')
ax.tick_params(left=True, bottom=True)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==0],y='std',x='avg_usage',color = 'royalblue', alpha = 0.03, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==3],y='std',x='avg_usage',color = 'green', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==2],y='std',x='avg_usage',color = 'orange', alpha = 0.2, s = 0.5)
# ax = sns.scatterplot(ax=axes[1],data = df_filtered.loc[df_filtered['cat']==1],y='std',x='avg_usage',color = 'magenta', alpha = 0.2, s = 0.5)
# ax.tick_params(left=True, bottom=True)
# ax.set(xlabel='average PAS usage over tissues',ylabel='st.dev. of PAS usage\nover tissues')
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/suppl_avg_usage_vs_std.pdf',bbox_inches='tight',dpi=600)
In [27]:
df_filtered['t'] = 1
df_filtered['std_1'] = np.round(df_filtered['std'],2)
gr = df_filtered.groupby('std_1').agg({'t':np.sum}).reset_index().sort_values('t',ascending=False)
sns.histplot(df_filtered.loc[df_filtered['std_1'].isin([0.23,0.24])]['num_no_expr'],color='red',stat='density',bins=range(0,30))
sns.histplot(df_filtered.loc[~df_filtered['std_1'].isin([0.23,0.24])]['num_no_expr'],color='blue',stat='density',bins=range(0,30))
Out[27]:
<AxesSubplot: xlabel='num_no_expr', ylabel='Density'>
In [558]:
# experimental, does not work that good
# scaling_value = 'q90'
# df_scores = pd.concat([df_filtered[['id','reassigned_g','std',scaling_value]],(df_filtered[usage_cols].sub(df_filtered[usage_cols].mean(1),axis=0)).div(df_filtered[scaling_value],axis=0)],axis=1)
In [487]:
num_PASs_selection = (3,10)
scaling_value = 'q95'
data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>-1.5)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
len(data)
Out[487]:
6575
In [488]:
# smth is wrong with cluster assignment
data = df.loc[df['Cluster']!='not_considered'].reset_index(drop=True)
len(data),data['mean_denom_log2'].min()
Out[488]:
(1644, -1.4603511946072305)
In [2]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
In [4]:
import scipy.cluster
In [ ]:
scipy.
In [195]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from matplotlib.colors import Normalize, LinearSegmentedColormap
scaling_value = 'q95'
method = 'ward'
num_PASs_selection = (3,20)
n_clusters = 30
data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>(-1))&(df['num_no_expr']<1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
# data = df.loc[(df['std']>df[scaling_value])&(df['mean_denom_log2']>-1)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
# data = df.loc[df['Cluster']!='not_considered'].reset_index(drop=True) # there, filtering by the number of PAS is done after selection by quantile and mean_denom_log2
ids = data['id']
data = data[usage_cols]
# Standardize the data
# scaler = StandardScaler()
# scaled_data = scaler.fit_transform(data)
scaled_data = data.values
# Perform hierarchical clustering
linkage_matrix = linkage(scaled_data, method = method)
# Generate dendrogram to capture leaf order
dendro = dendrogram(linkage_matrix, no_plot = True)
ordered_leaves = dendro['leaves']
# Assign cluster labels based on the number of clusters
cluster_labels = fcluster(linkage_matrix, t = n_clusters, criterion = 'maxclust')
data['Cluster'] = cluster_labels
# Reorder the data according to the dendrogram leaves
data['DendroOrder'] = ordered_leaves
# Sort by Cluster and Dendrogram, and sort 'id' column in the same order
data_sorted = data.sort_values(by = ['Cluster', 'DendroOrder'])
ids_sorted = ids.iloc[data_sorted.index] # Sort 'id' alongside data
# Create a color palette for clusters
unique_clusters = np.unique(cluster_labels)
palette = sns.color_palette('hls', len(unique_clusters))
# Map clusters to colors
cluster_color_map = {cluster: palette[i] for i, cluster in enumerate(unique_clusters)}
row_colors = data_sorted['Cluster'].map(cluster_color_map)
# track row positions
data_sorted['row_pos'] = list(range(0,len(data_sorted)))
# Drop 'Cluster' and 'DendroOrder' for the heatmap, but keep the sorted 'id'
data_transformed = data_sorted.drop(columns = ['Cluster', 'DendroOrder','row_pos'])
# add id so that we can then retrieve cluster for PAS, save to data_clustered dataframe
data_sorted['id'] = list(ids_sorted)
data_clustered = data_sorted.copy().reset_index(drop=True)
# Custom colormap for the heatmap
cmap = LinearSegmentedColormap.from_list("custom_cmap", ["white", "blue"])
norm = Normalize(vmin=data_transformed.min().min(), vmax=data_transformed.max().max())
data_transformed.columns = [col.replace('usage_', '') for col in data_transformed.columns]
# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'
data_transformed = data_transformed.rename(columns = tissue_rename_dict)
In [197]:
sns.set(font_scale = 0.5)
sns.set_style("white")
# Generate the clustermap
g = sns.clustermap(data_transformed, method=method, row_cluster=False, col_cluster=True, vmin=0,vmax=1,
row_colors=list(row_colors), cmap=cmap, norm=norm, figsize=(5.5, 5.5), cbar_kws = {'label':'PAS usage'}, cbar_pos=(0.95, 0.5, 0.02, 0.18)) # Adjust the 'shrink' value to make the color bar narrower
# Adjust x-tick labels
x_labels = g.ax_heatmap.get_xticklabels()
# Reapply the customized labels
g.ax_heatmap.set_xticklabels(labels = x_labels, rotation=60, ha='right',va='top',rotation_mode='anchor')
g.ax_heatmap.tick_params(right=False, bottom=True,width=0.5)
g.ax_heatmap.set_yticklabels(labels = [])
g.ax_heatmap.text(-1.2,int(len(data_transformed)/2),'cluster',rotation=90)
g.ax_cbar.tick_params(width=0.5)
gr = data_sorted.groupby('Cluster').agg({'row_pos':max}).reset_index()
for index,row in gr.head(len(gr)-1).iterrows():
g.ax_heatmap.text(-0.65,row['row_pos'],'----',color='black',va='center')
# g.ax_heatmap.text(-1.65,row['row_pos'],str(row['Cluster']),color='black',va='center')
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
g.savefig(subdirs['figures_dir']+'over_tissues/2B.png',bbox_inches='tight',dpi=600)
g.savefig(subdirs['figures_dir']+'over_tissues/2B.pdf',bbox_inches='tight',dpi=600)
In [28]:
# we need this for the correct segment class
organism = 'human'
organism_label = organism if organism!='celegans' else 'worm'
df_name = 'v3.'+organism_label
v3_pas_dir = subdirs['temp_dir']+df_name+'.with_segment_class.tsv'
SCINPAS = pd.read_csv(v3_pas_dir,delimiter="\t",index_col=None,header=0)
In [80]:
# tissue-specific vs non-tissue-specific
num_PASs_selection = (3,20)
data = df.loc[(df['mean_denom_log2']>(-1))&(df['num_no_expr']==0)&(df['num_PASs']>=num_PASs_selection[0])&(df['num_PASs']<=num_PASs_selection[1])].reset_index(drop=True)
data = pd.merge(data,SCINPAS[['id','segment_class']],how='left',on='id')
data = pd.merge(data.drop('Cluster',1),data_clustered[['id','Cluster']],how='left',on='id')
data['Cluster'] = data['Cluster'].fillna(-1).astype('int')
In [83]:
scaling_value = 'q95'
scaling_value_ubiq = 'q20'
data['avg_usage'] = data[usage_cols].mean(1) # mean of means
data['ts'] = (data['std']>data[scaling_value]).astype('int')
data['ubiq'] = ((data['avg_usage']>0.5)&(data['std']<data[scaling_value_ubiq])).astype('int')
data['low'] = ((data['avg_usage']<0.5)&(data['std']<data[scaling_value_ubiq])).astype('int')
data['cat'] = data['ts']*3+data['ubiq']*2+data['low']*1
In [84]:
Counter(data['cat'])
Out[84]:
Counter({0: 82781, 3: 4465, 1: 20358, 2: 163})
In [85]:
# try to separate particular clusters
from scipy.stats import binom
data['t']=1
gr = data.groupby(['Cluster','segment_class']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('Cluster').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='Cluster')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
In [90]:
# try to separate tissue-specific vs non-tissue-specific
from scipy.stats import binom
data['t']=1
gr = data.groupby(['cat','segment_class']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('cat').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='cat')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
big_cat = 'segment_class'
big_cat_vals = ['TE','E','I','A','D_I','U_I','N']
big_cat_labels = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic','ambigous']
map_dict = {}
i=0
for elem in big_cat_vals:
map_dict[elem] = big_cat_labels[i]
i=i+1
gr['segment_class_long'] = gr['segment_class'].map(map_dict)
map_dict = {0:'other PAS',1:'rarely used PAS',2:'ubiquitous PAS',3:'tissue-specific PAS',}
gr['ts'] = gr['cat'].map(map_dict)
In [91]:
gr.loc[gr['segment_class']=='TE']
Out[91]:
| cat | segment_class | t | t_sum | % | prop | %_ci_up | %_ci_down | segment_class_long | ts | |
|---|---|---|---|---|---|---|---|---|---|---|
| 4 | 0 | TE | 26453 | 82781 | 31.96 | 0.319554 | 32.27 | 31.64 | terminal exon | other PAS |
| 10 | 1 | TE | 3577 | 20358 | 17.57 | 0.175705 | 18.10 | 17.05 | terminal exon | rarely used PAS |
| 15 | 2 | TE | 133 | 163 | 81.60 | 0.815951 | 87.12 | 75.46 | terminal exon | ubiquitous PAS |
| 21 | 3 | TE | 2630 | 4465 | 58.90 | 0.589026 | 60.34 | 57.45 | terminal exon | tissue-specific PAS |
In [128]:
from statsmodels.stats import proportion as smprop
x_feature, y_feature, hue_feature = 'segment_class_long', '%','ts'
order = ['terminal exon','exonic','intronic','alternative\n(exon/intron)','downstream\nintergenic','upstream\nintergenic']
hue_order = ['rarely used PAS','other PAS','tissue-specific PAS','ubiquitous PAS']
palette = ['magenta','royalblue','green','orange']
dodge = 1.012
# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
reorder_dict_x[x_val] = i
i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
reorder_dict_hue[hue_val] = i
i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']+dodge*((gr_reordered['hue_order']+1)/(len(hue_order)+1)-0.5)
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2, 1))
# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette)
ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])],
elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")
ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel='% of PAS in class')
ax.tick_params(left=True, bottom=True,width=0.5)
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C.pdf',bbox_inches='tight',dpi=600)
In [132]:
# association with motif presence
data['t']=1
gr = data.groupby(['cat','all_motif']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('cat').agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on='cat')
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['all_motif']==1].reset_index(drop=True)
map_dict = {0:'other PAS',1:'rarely used PAS',2:'ubiquitous PAS',3:'tissue-specific PAS',}
gr['ts'] = gr['cat'].map(map_dict)
from statsmodels.stats import proportion as smprop
x_feature, y_feature, hue_feature = 'all_motif', '%','ts'
order = [1]
hue_order = ['rarely used PAS','other PAS','tissue-specific PAS','ubiquitous PAS']
palette = ['magenta','royalblue','green','orange']
dodge = 1.012
# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
reorder_dict_x[x_val] = i
i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
reorder_dict_hue[hue_val] = i
i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']+dodge*((gr_reordered['hue_order']+1)/(len(hue_order)+1)-0.5)
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(0.35, 1))
# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette)
ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]-0.5), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])],
elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")
ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel='Motif presence, %',xticks=[])
ax.tick_params(left=False, bottom=False,right=True,labelright=True,labelleft=False,width=0.5)
ax.yaxis.set_label_position("right")
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C_add.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/2C_add.pdf',bbox_inches='tight',dpi=600)
Prepare metadata¶
human¶
In [133]:
merged_num_PAS_each_class = pd.read_csv('./merged_num_PAS_each_class.csv',delimiter=",",index_col=None,header=0)
human_cell_ATLAS_metadata_fastq_final_final_alter = pd.read_csv('./human_cell_ATLAS_metadata_fastq_final_final_alter.csv',delimiter=",",index_col=None,header=0)
NeMO_manifest_metadata_23017c0cfd = pd.read_csv('./NeMO_manifest_metadata_23017c0cfd.tsv',delimiter="\t",index_col=None,header=0)
In [134]:
import re
def get_sample(x):
sample = '10X_'+x['project.project_core.project_short_name']+'_'+x['file_name'].split('_')[0].replace('-','')
if len(x['file_name'].split('_'))>1:
tmp = x['file_name'].split('_')[1]
if ((re.match('S[0-9]',tmp) and (
(not x['file_name'].split('_')[0].startswith('HCAHeart')) and
(not x['file_name'].split('_')[0].startswith('SIGAA')) and
(not x['file_name'].split('_')[0].startswith('SIGAC')) and
(not x['file_name'].split('_')[0].startswith('SIGAD4')) and
(not x['file_name'].split('_')[0].startswith('SIGAE4')) and
(not x['file_name'].split('_')[0].startswith('SIGAF4')) and
(not x['file_name'].split('_')[0].startswith('SIGAG4')) and
(not x['file_name'].split('_')[0].startswith('SIGAH4')) and
(not x['file_name'].split('_')[0].startswith('sample')) and
(not x['file_name'].split('_')[0].startswith('SRR')) and
(not x['file_name'].split('_')[0].startswith('CZIKidney')) and
(not x['file_name'].split('_')[0].startswith('3-')))) or
tmp in ['bamtofastq','cd45pos','Cornea','MUC9105'] or
tmp.startswith('MUC11') or
tmp.startswith('Endo')):
sample=sample+tmp
elif x['file_name'].split('_')[1:3] == ['Adult','Cornea']:
sample=sample+'AdultCornea'
elif tmp in ['scRNAseq','HS','D17PrPzF','D35PrTzF','D17PrTzF','D27PrTzF','D35PrPzF','1','4','2','3','5','6','7'] or tmp.startswith('CD45pos') or tmp.startswith('TotalHK'):
sample=sample+tmp+x['file_name'].split('_')[2]
elif tmp in ['2nd','July','June','Wong']:
sample=sample+tmp+x['file_name'].split('_')[2]+x['file_name'].split('_')[3]
else:
if x['file_name'].startswith('HK.') or x['file_name'].startswith('SW'):
sample = '10X_'+x['project.project_core.project_short_name']+'_'+x['file_name'].split('.')[0]
return sample
human_cell_ATLAS_metadata_fastq_final_final_alter['sample'] = human_cell_ATLAS_metadata_fastq_final_final_alter.apply(lambda x:get_sample(x),1)
In [135]:
NeMO_manifest_metadata_23017c0cfd['sample'] = NeMO_manifest_metadata_23017c0cfd.apply(lambda x:x['sample_id'][:3]+'_'+x['sample_id'][3:].replace('-','_').replace(';','_'),1)
In [136]:
tmp1 = pd.merge(human_cell_ATLAS_metadata_fastq_final_final_alter,merged_num_PAS_each_class[['sample','organ']],how='inner',on='sample')
tmp2 = pd.merge(NeMO_manifest_metadata_23017c0cfd,merged_num_PAS_each_class[['sample','organ']],how='inner',on='sample')
In [137]:
len(tmp2['sample'].unique())+len(tmp1['sample'].unique()),len(tmp1['sample'].unique()),len(tmp2['sample'].unique())
Out[137]:
(813, 722, 91)
In [138]:
tmp1[['sample','organ']+list(tmp1.columns[:-2])].to_csv('./HumanCellAtlas_match.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [139]:
tmp2[['sample','organ']+list(tmp2.columns[:-2])].to_csv('./Nemo_match.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [140]:
# new metadata from Young
final_filtered_human_metadata_240627 = pd.read_csv('./final_filtered_human_metadata_240627.csv',delimiter=",",index_col=None,header=0)
final_filtered_human_metadata_240627 = final_filtered_human_metadata_240627.loc[~final_filtered_human_metadata_240627['scinpas_sample'].isna()].reset_index(drop=True)
final_filtered_human_metadata_240627['DataBase'] = final_filtered_human_metadata_240627.apply(lambda x:'NeMO' if x['scinpas_organ']=='brain' else 'HCA',1)
In [141]:
HCA_full_manifest = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/hca-manifest-1d3efc71-5986-591b-bd99-45b65bce9b09.99950987-0051-50bf-9402-dc99918f99dc.tsv',delimiter="\t",index_col=None,header=0)
HCA_full_manifest['HCA_present'] = 1
In [142]:
test = pd.merge(final_filtered_human_metadata_240627,HCA_full_manifest[['bundle_uuid','HCA_present']],how='left',on='bundle_uuid')
matched = test.loc[((test['scinpas_organ']!='brain')&(~test['HCA_present'].isna()))|(test['scinpas_organ']=='brain')].reset_index(drop=True)
not_matched = test.loc[(test['scinpas_organ']!='brain')&(test['HCA_present'].isna())].reset_index(drop=True)
not_matched['match_by'] = not_matched['file_name'].str.replace('_001.fastq.gz','')
HCA_full_manifest['match_by'] = HCA_full_manifest['file_name'].str.replace('.fastq.gz','')
not_matched = pd.merge(not_matched.drop(['bundle_uuid','HCA_present'],1),HCA_full_manifest[['match_by','bundle_uuid','HCA_present']],how='left',on='match_by')
still_not_matched = not_matched.loc[not_matched['bundle_uuid'].isna()].reset_index(drop=True)
still_not_matched['bundle_uuid'] = 'NA'
now_matched = not_matched.loc[~not_matched['bundle_uuid'].isna()].reset_index(drop=True)
cols = ['scinpas_sample','DataBase','scinpas_organ']+['bundle_uuid',
'file_name',
'file_format',
'file_size',
'cell_suspension.selected_cell_type',
'library_preparation_protocol.library_construction_approach',
'library_preparation_protocol.nucleic_acid_source',
'project.project_core.project_short_name',
'specimen_from_organism.diseases',
'specimen_from_organism.organ',
'specimen_from_organism.organ_part',
'donor_organism.biomaterial_core.biomaterial_id',
'donor_organism.genus_species',
'donor_organism.development_stage',
'donor_organism.diseases',
'donor_organism.organism_age',
'sample.biomaterial_core.biomaterial_id','sequencing_quality']
final_metadata = pd.concat([matched[cols],now_matched[cols],still_not_matched[cols]]).reset_index(drop=True)
final_metadata = final_metadata.drop_duplicates(['DataBase','scinpas_organ','scinpas_sample','file_name']).sort_values(['DataBase','scinpas_organ','scinpas_sample']).reset_index(drop=True)
final_metadata['bundle_uuid'] = final_metadata.apply(lambda x:x['bundle_uuid'] if x['DataBase']=='HCA' else 'NA',1)
In [150]:
# rename tissues
tissue_rename_dict = {}
tissue_rename_dict['trachea']= 'tracheal epithelium'
tissue_rename_dict['nose']= 'nasal mucosa'
tissue_rename_dict['kidney']= 'kidney parenchyma'
tissue_rename_dict['intestine'] = 'intestine'
tissue_rename_dict['bone'] = 'intervertebral disc'
tissue_rename_dict['penis'] = 'corpus cavernosum'
tissue_map = {}
for tissue in list(final_metadata['scinpas_organ'].unique()):
if tissue in list(tissue_rename_dict.keys()):
tissue_map[tissue] = tissue_rename_dict[tissue]
else:
tissue_map[tissue] = tissue
final_metadata['scinpas_organ_new'] = final_metadata['scinpas_organ'].map(tissue_map)
final_metadata['scinpas_organ'] = final_metadata['scinpas_organ_new']
final_metadata = final_metadata.drop('scinpas_organ_new',1)
In [156]:
final_metadata.to_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/sample_and_file_metadata.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
mouse¶
In [62]:
mouse_metadata_final_filtered = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/mouse_metadata_final_filtered.csv",delimiter=",",index_col=None,header=0)
mouse_metadata_final_filtered = mouse_metadata_final_filtered.rename(columns={'scinpas':'scinpas_sample','organ':'scinpas_organ'})
mouse_metadata_final_filtered['DataBase'] = 'Tabula Muris Senis'
official_metadata = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/GSM4505404_tabula-muris-senis-droplet-official-raw-obj-metadata.csv.gz",delimiter=",",index_col=None,header=0)
official_metadata['sample'] = official_metadata.apply(lambda x:'_'.join(x['cell'].split('_')[:-1]),1)
official_metadata = official_metadata.drop_duplicates('sample').reset_index(drop=True)
mouse_metadata_final_filtered = pd.merge(mouse_metadata_final_filtered,official_metadata[['sample','age','mouse.id','sex','subtissue']],how='left',on='sample')
mouse_metadata_final_filtered = mouse_metadata_final_filtered[['scinpas_sample','DataBase','scinpas_organ','age','mouse.id','sex','subtissue']]
mouse_metadata_final_filtered.to_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/mouse/polyAsite_Atlas_3/sample_and_file_metadata.tsv",sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
worm¶
In [81]:
worm_metadata_final_filtered = pd.read_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/worm_metadata_final_filtered.csv",delimiter=",",index_col=None,header=0)
worm_metadata_final_filtered.columns = ['SRA_id','scinpas_sample','strain']
worm_metadata_final_filtered[['scinpas_sample','SRA_id','strain']].to_csv("/scicore/home/zavolan/GROUP/SCINPAS_catalog/c_elegans/polyAsite_Atlas_3/sample_and_file_metadata.tsv",sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
Random things¶
In [121]:
# make big bed file
a = []
for elem in list('chr'+pd.Series(range(1,22)).astype('str')):
for i in range(0,10**6,100):
a.append([elem,i,i+99])
bed_file = pd.DataFrame(a,columns = ['chr','start','end'])
bed_file['len'] = bed_file['end']-bed_file['start']
bed_file['cumul_len'] = bed_file['len'].cumsum(axis=0)
M = bed_file['len'].sum()
In [136]:
# make a uniform random sample
sample_size = 10**6
from scipy.stats import randint
r = randint.rvs(0, M, size=sample_size)
In [137]:
# now get the actual genomic coordinates
r.sort()
r = pd.Series(r)
In [132]:
genomic_positions = []
chromosomes = []
pred = 0
k=0 # iterator
start_time = time.time()
for index,row in bed_file.iterrows():
r_sub = r.loc[(r<row['cumul_len'])&(r>=pred)]
chr = row['chr']
chromosomes = chromosomes+[chr]*len(r_sub)
genomic_positions = genomic_positions+list(row['start']+r_sub-pred)
pred = row['cumul_len']
if k%5000==0:
print(str(k)+' done, '+str(time.time()-start_time))
k=k+1
0 done, 0.04459881782531738 5000 done, 8.946052551269531 10000 done, 18.63933253288269 15000 done, 29.077177047729492 20000 done, 40.23433971405029 25000 done, 52.232123613357544 30000 done, 64.98941254615784 35000 done, 78.56097292900085 40000 done, 93.05825996398926 45000 done, 108.46607065200806 50000 done, 125.21714735031128 55000 done, 142.62919116020203 60000 done, 161.10411477088928 65000 done, 180.62160897254944 70000 done, 201.32123613357544 75000 done, 222.88662695884705 80000 done, 245.30620408058167 85000 done, 269.0161051750183 90000 done, 294.012845993042 95000 done, 320.641884803772 100000 done, 348.86718225479126 105000 done, 377.6022136211395 110000 done, 407.78930020332336 115000 done, 439.3883044719696 120000 done, 472.32599997520447 125000 done, 506.6241524219513 130000 done, 542.3440065383911 135000 done, 579.6196844577789 140000 done, 618.4077925682068 145000 done, 658.4839911460876 150000 done, 699.9941833019257 155000 done, 742.6552038192749 160000 done, 786.8064415454865 165000 done, 832.4553668498993 170000 done, 879.4657661914825 175000 done, 927.979006767273 180000 done, 978.0892655849457 185000 done, 1029.5985431671143 190000 done, 1082.7922778129578 195000 done, 1136.6507608890533 200000 done, 1192.069009065628 205000 done, 1250.054283618927
In [ ]:
resulting_df = pd.DataFrame([chromosomes,genomic_positions]).transpose()
resulting_df.columns = ['chr','pos']
In [91]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1, 1, sharey=True, sharex=False,figsize=(5,5))
ax = sns.histplot(data = r,stat='proportion',bins=10)
ax.set(xlim=(0,M))
Out[91]:
[(0.0, 16600.0)]
Fraction of intergenic etc across samples and tissues¶
In [4]:
# merged_num_PAS_each_class_rpm = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/merged_num_PAS_each_class_rpm.csv',delimiter=",",index_col=None,header=0)
# merged_num_PAS_each_class_rpm['total'] = merged_num_PAS_each_class_rpm[['intronic', 'exonic', 'TE', 'true_intergenic',
# 'antisense_intronic', 'antisense_exonic', 'antisense_TE']].sum(1)
merged_num_PAS_each_class = pd.read_csv('./merged_num_PAS_each_class.csv',delimiter=",",index_col=None,header=0)
merged_num_PAS_each_class['total'] = merged_num_PAS_each_class[['intronic', 'exonic', 'TE', 'true_intergenic',
'antisense_intronic', 'antisense_exonic', 'antisense_TE']].sum(1)
In [5]:
merged_num_PAS_each_class.head()
Out[5]:
| sample | organ | intronic | exonic | TE | true_intergenic | antisense_intronic | antisense_exonic | antisense_TE | total_PAS | total | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10X_131_1 | brain | 7323 | 939 | 2489 | 2076 | 1809 | 94 | 142 | 814610 | 14872 |
| 1 | 10X_131_2 | brain | 1677 | 294 | 1248 | 511 | 405 | 32 | 41 | 814610 | 4208 |
| 2 | 10X_131_3 | brain | 1258 | 231 | 1129 | 370 | 324 | 22 | 28 | 814610 | 3362 |
| 3 | 10X_131_4 | brain | 940 | 200 | 941 | 297 | 257 | 15 | 27 | 814610 | 2677 |
| 4 | 10X_131_7 | brain | 8052 | 948 | 2554 | 1891 | 1657 | 112 | 131 | 814610 | 15345 |
In [ ]:
In [3]:
lims = (0,6)
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1, 2, sharey=True, sharex=False,figsize=(10,5))
merged_num_PAS_each_class['total_log'] = np.log10(merged_num_PAS_each_class['total']+1)
merged_num_PAS_each_class_rpm['total_log'] = np.log10(merged_num_PAS_each_class_rpm['total']+1)
for category in ['antisense_TE','antisense_exonic','antisense_intronic','true_intergenic','exonic','intronic','TE']:
merged_num_PAS_each_class[category+'_log'] = np.log10(merged_num_PAS_each_class[category]+1)
merged_num_PAS_each_class[category+'_%'] = np.round(merged_num_PAS_each_class[category]/merged_num_PAS_each_class['total']*100,2)
merged_num_PAS_each_class_rpm[category+'_%'] = np.round(merged_num_PAS_each_class_rpm[category]/merged_num_PAS_each_class_rpm['total']*100,2)
ax0 = sns.regplot(ax=axes[0],data = merged_num_PAS_each_class,y=category+'_%',x='total_log',label = category,scatter_kws={'s':5})
ax1 = sns.regplot(ax=axes[1],data = merged_num_PAS_each_class_rpm,y=category+'_%',x='total_log',label = category,scatter_kws={'s':5})
ax0.set(ylabel = '% in PAS class',xlabel = '# total supported PAS, $log_{10}$',title='# supported PAS\n(by a least one read)')
ax1.legend(bbox_to_anchor=(1.05, 1),loc=2,borderaxespad=0,title='PAS class',markerscale=1.5,ncols=1,fontsize=9,mode=None)
ax1.set(ylabel = '',xlabel = '# total polyA reads, $log_{10}$',title='# polyA reads')
# ax.set(xlim=lims,ylim=lims)
Out[3]:
[Text(0, 0.5, ''),
Text(0.5, 0, '# total polyA reads, $log_{10}$'),
Text(0.5, 1.0, '# polyA reads')]
In [5]:
merged_num_PAS_each_class.head()
Out[5]:
| sample | organ | intronic | exonic | TE | true_intergenic | antisense_intronic | antisense_exonic | antisense_TE | total_PAS | ... | antisense_intronic_log | antisense_intronic_% | true_intergenic_log | true_intergenic_% | exonic_log | exonic_% | intronic_log | intronic_% | TE_log | TE_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10X_131_1 | brain | 7323 | 939 | 2489 | 2076 | 1809 | 94 | 142 | 814610 | ... | 3.257679 | 12.16 | 3.317436 | 13.96 | 2.973128 | 6.31 | 3.864748 | 49.24 | 3.396199 | 16.74 |
| 1 | 10X_131_2 | brain | 1677 | 294 | 1248 | 511 | 405 | 32 | 41 | 814610 | ... | 2.608526 | 9.62 | 2.709270 | 12.14 | 2.469822 | 6.99 | 3.224792 | 39.85 | 3.096562 | 29.66 |
| 2 | 10X_131_3 | brain | 1258 | 231 | 1129 | 370 | 324 | 22 | 28 | 814610 | ... | 2.511883 | 9.64 | 2.569374 | 11.01 | 2.365488 | 6.87 | 3.100026 | 37.42 | 3.053078 | 33.58 |
| 3 | 10X_131_4 | brain | 940 | 200 | 941 | 297 | 257 | 15 | 27 | 814610 | ... | 2.411620 | 9.60 | 2.474216 | 11.09 | 2.303196 | 7.47 | 2.973590 | 35.11 | 2.974051 | 35.15 |
| 4 | 10X_131_7 | brain | 8052 | 948 | 2554 | 1891 | 1657 | 112 | 131 | 814610 | ... | 3.219585 | 10.80 | 3.276921 | 12.32 | 2.977266 | 6.18 | 3.905958 | 52.47 | 3.407391 | 16.64 |
5 rows × 26 columns
In [4]:
merged_num_PAS_each_class_rpm.head()
Out[4]:
| sample | organ | intronic | exonic | TE | true_intergenic | antisense_intronic | antisense_exonic | antisense_TE | total_read | total_PAS | total | total_log | antisense_TE_% | antisense_exonic_% | antisense_intronic_% | true_intergenic_% | exonic_% | intronic_% | TE_% | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10X_131_1 | brain | 10140.0 | 7843.0 | 48520.0 | 4233.0 | 3485.0 | 152.0 | 636.0 | 75009.0 | NaN | 75009.0 | 4.875119 | 0.85 | 0.20 | 4.65 | 5.64 | 10.46 | 13.52 | 64.69 |
| 1 | 10X_131_2 | brain | 1894.0 | 1364.0 | 7893.0 | 683.0 | 576.0 | 28.0 | 96.0 | 12534.0 | NaN | 12534.0 | 4.098124 | 0.77 | 0.22 | 4.60 | 5.45 | 10.88 | 15.11 | 62.97 |
| 2 | 10X_131_3 | brain | 1399.0 | 942.0 | 6219.0 | 528.0 | 381.0 | 16.0 | 77.0 | 9562.0 | NaN | 9562.0 | 3.980594 | 0.81 | 0.17 | 3.98 | 5.52 | 9.85 | 14.63 | 65.04 |
| 3 | 10X_131_4 | brain | 905.0 | 668.0 | 4342.0 | 389.0 | 268.0 | 18.0 | 59.0 | 6649.0 | NaN | 6649.0 | 3.822822 | 0.89 | 0.27 | 4.03 | 5.85 | 10.05 | 13.61 | 65.30 |
| 4 | 10X_131_7 | brain | 11564.0 | 7643.0 | 48780.0 | 4177.0 | 3044.0 | 156.0 | 704.0 | 76068.0 | NaN | 76068.0 | 4.881208 | 0.93 | 0.21 | 4.00 | 5.49 | 10.05 | 15.20 | 64.13 |
In [14]:
merged_num_PAS_each_class_rpm['project'] = merged_num_PAS_each_class_rpm['sample'].str.split('_',expand=True)[1]
merged_num_PAS_each_class_rpm['organ_x_project'] = merged_num_PAS_each_class_rpm['organ']+' '+merged_num_PAS_each_class_rpm['project']
gr2 = merged_num_PAS_each_class_rpm[['organ']].drop_duplicates().reset_index(drop=True)
gr2['color'] = list(sns.color_palette('husl',len(gr2)))
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,5, sharey=False, sharex=False,figsize=(15,5))
gr = merged_num_PAS_each_class_rpm.groupby(['organ','project','organ_x_project','sample']).agg({'total_log':np.median}).reset_index()
gr1 = gr.groupby(['organ']).agg({'total_log':np.median}).reset_index().rename(columns={'total_log':'total_log_organ'})
gr1 = pd.merge(gr1,gr2,how='left',on='organ')
gr1 = gr1.sort_values('total_log_organ',ascending=False).reset_index(drop=True)
ax = sns.swarmplot(ax=axes[0],data = gr,y='organ',order = list(gr1['organ']), palette = list(gr1['color']), x='total_log',edgecolor='black',linewidth=0,s=2)
ax.set(xlabel = '# total polyA reads, $log_{10}$')
i=1
for category in ['true_intergenic','antisense_intronic','intronic','TE']:
gr = merged_num_PAS_each_class_rpm.groupby(['organ','project','organ_x_project','sample']).agg({category+'_%':np.median}).reset_index()
gr1 = gr.groupby(['organ']).agg({category+'_%':np.median}).reset_index().rename(columns={category+'_%':category+'_%_organ'})
gr1 = pd.merge(gr1,gr2,how='left',on='organ')
gr1 = gr1.sort_values(category+'_%_organ',ascending=False).reset_index(drop=True)
gr = pd.merge(gr,gr1,how='left',on='organ')
gr = pd.merge(gr,gr2,how='left',on='organ')
gr = gr.sort_values([category+'_%_organ',category+'_%'],ascending=[False,False]).reset_index(drop=True)
ax = sns.swarmplot(ax=axes[i],data = gr,y='organ',order = list(gr1['organ']), palette = list(gr1['color']), x=category+'_%',edgecolor='black',linewidth=0,s=2)
ax.set(ylabel='')
i=i+1
fig.tight_layout(pad=0.5)
Comparison with polyAsite 2 Atlas¶
In [22]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [3,4,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
In [23]:
len(merged_pas_motif_table)
Out[23]:
18432135
In [24]:
SCINPAS_full = pd.read_csv('/scicore/home/zavolan/moon0000/GENE_ID/result/organ_score/pas_with_gene_id_v1.0.2_w_organ_score.bed',delimiter="\t",index_col=None,header=0)
In [29]:
tissues = ['nose', 'trachea', 'heart', 'intestine', 'breast', 'bone',
'pancreas', 'eye', 'kidney', 'penis', 'ureter', 'lung', 'liver', 'skin',
'prostate', 'uterus', 'bloodImmune', 'brain']
In [31]:
merged_pas_motif_table = pd.merge(SCINPAS_full[['seqid','start','end','id','score']+tissues].rename(columns={'score':'score_1'}),
merged_pas_motif_table,how='inner',on=['id'])
In [32]:
merged_pas_motif_table = merged_pas_motif_table.drop('score',1).rename(columns={'score_1':'score'}) # score_1 from SCINPAS
In [33]:
len(merged_pas_motif_table)
Out[33]:
18432135
In [35]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
In [51]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')
merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [52]:
motifs_to_search = [elem.replace('U','T') for elem in motifs]
In [53]:
polyAsite = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas/atlas.clusters.2.0.GRCh38.96.bed',delimiter="\t",index_col=None,header=None)
In [54]:
def parse_motifs(x):
if pd.isna(x[10]):
return 0
else:
l = x[10].split(';')
for motif in motifs_to_search:
for elem in l:
if elem.split('@')[0]==motif:
pos = int(elem.split('@')[1])
if pos>=-35 and pos<=-10:
return 1
return 0
polyAsite['any_canonic_motif'] = polyAsite.apply(lambda x: parse_motifs(x),1)
In [55]:
Counter(polyAsite['any_canonic_motif'])
Out[55]:
Counter({1: 342526, 0: 226479})
In [83]:
(342526+226479), 342526/(342526+226479) # t_PAS and % motif in v2 polyAsite Atlas
Out[83]:
(569005, 0.601973620618448)
In [86]:
motif_thresholds = [0.6,0.65,0.7,0.75,0.8,0.85]
a = {}
for tissue in tissues:
tmp = merged_pas_motif_table[['id',tissue,'any_canonic_motif']].sort_values(tissue,ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
for motif_threshold in motif_thresholds:
max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['id'])
if a.get(motif_threshold) is not None:
a[motif_threshold] = a[motif_threshold]+pas_to_append
else:
a[motif_threshold] = pas_to_append
print(tissue+' done')
res = {}
summary = []
for motif_threshold in motif_thresholds:
tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['id'])
tmp = pd.merge(tmp,merged_pas_motif_table[['id','any_canonic_motif']],how='left',on='id')
summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
nose done trachea done heart done intestine done breast done bone done pancreas done eye done kidney done penis done ureter done lung done liver done skin done prostate done uterus done bloodImmune done brain done 0.6 done 0.65 done 0.7 done 0.75 done 0.8 done 0.85 done
In [87]:
summary.columns = ['within_tissue_motif_threshold','# PAS', 'fraction of PAS with motif after union']
In [88]:
summary
Out[88]:
| within_tissue_motif_threshold | # PAS | fraction of PAS with motif after union | |
|---|---|---|---|
| 0 | 0.60 | 310975 | 0.451477 |
| 1 | 0.65 | 224565 | 0.494743 |
| 2 | 0.70 | 160240 | 0.545363 |
| 3 | 0.75 | 113060 | 0.606112 |
| 4 | 0.80 | 78177 | 0.675672 |
| 5 | 0.85 | 51491 | 0.756890 |
Quantify usage instead of RPM¶
In [2]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [3,4,6,7,8,9,11,12,13,14,15,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
In [3]:
len(merged_pas_motif_table)
Out[3]:
18432135
In [4]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
motifs
Out[4]:
['AAUAAA', 'AUUAAA', 'UAUAAA', 'AGUAAA', 'AAUACA', 'AAUAUA', 'CAUAAA', 'AAUGAA', 'GAUAAA', 'ACUAAA', 'AAUAGA']
In [93]:
len(motifs)
Out[93]:
11
In [5]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')
merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [6]:
SCINPAS_full = pd.read_csv('/scicore/home/zavolan/moon0000/GENE_ID/result/organ_score/pas_with_gene_id_v1.0.2_w_organ_score.bed',delimiter="\t",index_col=None,header=0)
In [7]:
tmp_dir = '/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/'
out = subprocess.check_output('mkdir -p '+tmp_dir, shell=True)
SCINPAS_full['new_id'] = SCINPAS_full.index
SCINPAS_full['score_tmp'] = 1
SCINPAS_full[['seqid','start','end','new_id','score_tmp','strand']].to_csv(tmp_dir+'scinpas_full.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [8]:
'bedtools sort -i '+tmp_dir+'scinpas_full.bed > '+tmp_dir+'scinpas_full.sorted.bed'
Out[8]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.sorted.bed'
In [8]:
motif_info = pd.merge(SCINPAS_full[['id','new_id']],
merged_pas_motif_table[['id','any_canonic_motif']],how='inner',on=['id'])
In [9]:
len(motif_info)
Out[9]:
18432135
In [7]:
# take GENCODE gtf, extract genes from there, and subset only non-overlapping genes
gtf = pd.read_csv('/scicore/home/zavolan/GROUP/Genomes/homo_sapiens/hg38_v42/gencode.v42.annotation.gtf',delimiter="\t",index_col=None,header=None,skiprows=5)
genes = gtf.loc[gtf[2]=='gene'].reset_index(drop=True)
genes['gene_id'] = genes[8].str.split('gene_id "',expand=True)[1].str.split('"',expand=True)[0]
genes['start'] = genes[3]-1
genes['score_tmp'] = 1
genes = genes.drop_duplicates([0,3,4,6]).reset_index(drop=True)
In [10]:
genes[[0,'start',4,'gene_id','score_tmp',6]].to_csv(tmp_dir+'genes.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [11]:
'bedtools sort -i '+tmp_dir+'genes.bed > '+tmp_dir+'genes.sorted.bed'
Out[11]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.sorted.bed'
In [23]:
'bedtools cluster -d 1001 -s -i '+tmp_dir+'genes.sorted.bed > '+tmp_dir+'genes.clustered.bed'
Out[23]:
'bedtools cluster -d 1001 -s -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.sorted.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.clustered.bed'
In [29]:
genes_clustered = pd.read_csv(tmp_dir+'genes.clustered.bed',delimiter="\t",index_col=None,header=None)
genes_clustered['t']=1
gr = genes_clustered.groupby([6]).agg({'t':np.sum}).reset_index()
non_overlap_genes = pd.merge(genes_clustered,gr.loc[gr['t']==1][[6]].reset_index(drop=True),how='inner',on=[6])
plus = non_overlap_genes.loc[non_overlap_genes[5]=='+'].reset_index(drop=True)
minus = non_overlap_genes.loc[non_overlap_genes[5]=='-'].reset_index(drop=True)
plus[2] = plus[2]+1000 # add downstream 1 kb region to include
minus[1] = minus[1]-1000 # add downstream 1 kb region to include
non_overlap_genes = pd.concat([plus,minus]).reset_index(drop=True)
In [31]:
non_overlap_genes[[0,1,2,3,4,5]].to_csv(tmp_dir+'genes.non_overlap.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [28]:
'bedtools sort -i '+tmp_dir+'genes.non_overlap.bed > '+tmp_dir+'genes.non_overlap.sorted.bed'
Out[28]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed'
In [32]:
len(non_overlap_genes)
Out[32]:
39528
In [42]:
'bedtools intersect -sorted -f 1.0 -s -wa -wb -a '+tmp_dir+'scinpas_full.sorted.bed -b '+tmp_dir+'genes.non_overlap.sorted.bed | cut -f1,4,6,8,9,10 > '+tmp_dir+'intersection.bed'
Out[42]:
'bedtools intersect -sorted -f 1.0 -s -wa -wb -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/scinpas_full.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed | cut -f1,4,6,8,9,10 > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/intersection.bed'
In [10]:
intersection = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/intersection.bed',delimiter="\t",index_col=None,header=None)
intersection.columns = ['chr','new_id','strand','gene_start','gene_end','gene_id']
intersection[['chr','strand']] = intersection[['chr','strand']].astype('category')
In [11]:
intersection['t']=1
gr = intersection.groupby('new_id').agg({'t':sum}).reset_index()
In [12]:
len(intersection)
Out[12]:
5388855
In [13]:
tissues = ['nose', 'trachea', 'heart', 'intestine', 'breast', 'bone',
'pancreas', 'eye', 'kidney', 'penis', 'ureter', 'lung', 'liver', 'skin',
'prostate', 'uterus', 'bloodImmune', 'brain']
In [14]:
df = pd.merge(intersection[['new_id','gene_id']],SCINPAS_full[['new_id','score','class']+tissues],how='inner',on='new_id')
In [15]:
df = pd.merge(df,motif_info[['new_id','any_canonic_motif']],how='inner',on='new_id')
In [16]:
len(df)
Out[16]:
5388855
In [17]:
df['class'] = df['class'].astype('category')
usage_input_cols = ['score']+tissues
group_dict = {}
for col in usage_input_cols:
group_dict[col] = np.sum
gr = df.groupby('gene_id').agg(group_dict).reset_index()
col_to_rename = list(gr.columns[1:])
rename_dict = {}
for col in col_to_rename:
rename_dict[col] = col+'_sum'
gr = gr.rename(columns = rename_dict)
df = pd.merge(df,gr,how='inner',on='gene_id')
In [18]:
denom_usage_input_cols = [elem+'_sum' for elem in usage_input_cols]
In [19]:
ratio_cols = [elem+'_ratio' for elem in usage_input_cols]
df[ratio_cols] = (df[usage_input_cols].values/(df[denom_usage_input_cols]+10**(-30)).values)
In [20]:
df['t']=1
gr = df.groupby('gene_id').agg({'t':sum}).reset_index()
df = pd.merge(df.drop('t',1),gr,how='inner',on='gene_id')
In [21]:
len(df)
Out[21]:
5388855
In [22]:
df = df.loc[df['t']>1].reset_index(drop=True)
In [23]:
len(df)
Out[23]:
5385702
In [24]:
len(df['t'].unique())
Out[24]:
1828
In [ ]:
# what would be the numbers for polyAsite Atlas v2 if we only consider these regions
In [33]:
polyAsite = pd.read_csv('/scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas/atlas.clusters.2.0.GRCh38.96.bed',delimiter="\t",index_col=None,header=None)
In [45]:
polyAsite_short = polyAsite[[0,1,2,3,4,5]]
polyAsite_short[4] = 1
polyAsite_short[0] = 'chr'+polyAsite_short[0].astype('str')
polyAsite_short = polyAsite_short.loc[polyAsite_short[0].isin(list(non_overlap_genes[0].unique()))].reset_index(drop=True)
polyAsite_short.to_csv(tmp_dir+'atlas_v2.bed',sep=str('\t'),header=False,index=None,quoting=csv.QUOTE_NONE)
In [48]:
'bedtools sort -i '+tmp_dir+'atlas_v2.bed > '+tmp_dir+'atlas_v2.sorted.bed'
Out[48]:
'bedtools sort -i /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.sorted.bed'
In [50]:
'bedtools intersect -sorted -f 1.0 -s -wa -a '+tmp_dir+'atlas_v2.sorted.bed'+' -b '+tmp_dir+'genes.non_overlap.sorted.bed > '+tmp_dir+'Atlas_v2.intersection.bed'
Out[50]:
'bedtools intersect -sorted -f 1.0 -s -wa -a /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/atlas_v2.sorted.bed -b /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/genes.non_overlap.sorted.bed > /scicore/home/zavolan/GROUP/SCINPAS_catalog/human/polyAsite_Atlas_3/tmp/Atlas_v2.intersection.bed'
In [51]:
polyAsite_subset = pd.read_csv(tmp_dir+'Atlas_v2.intersection.bed',delimiter="\t",index_col=None,header=None)
In [54]:
len(polyAsite),len(polyAsite.loc[polyAsite[3].isin(list(polyAsite_subset[3].unique()))])
Out[54]:
(569005, 195574)
In [55]:
polyAsite_subset = polyAsite.loc[polyAsite[3].isin(list(polyAsite_subset[3].unique()))].reset_index(drop=True)
In [58]:
motifs_to_search = [elem.replace('U','T') for elem in motifs]
def parse_motifs(x):
if pd.isna(x[10]):
return 0
else:
l = x[10].split(';')
for motif in motifs_to_search:
for elem in l:
if elem.split('@')[0]==motif:
pos = int(elem.split('@')[1])
if pos>=-35 and pos<=-10:
return 1
return 0
polyAsite_subset['any_canonic_motif'] = polyAsite_subset.apply(lambda x: parse_motifs(x),1)
In [60]:
len(polyAsite_subset),len(polyAsite_subset.loc[polyAsite_subset['any_canonic_motif']==1])/len(polyAsite_subset)
Out[60]:
(195574, 0.5822655363187336)
In [61]:
# these are the metrics to achieve
In [69]:
# check the performance of scores as they are on this smaller subset
motif_thresholds = [0.6,0.65,0.7,0.75,0.8,0.85]
a = {}
for tissue in tissues:
tmp = df[['new_id',tissue,'any_canonic_motif']].sort_values(tissue,ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
for motif_threshold in motif_thresholds:
max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['new_id'])
if a.get(motif_threshold) is not None:
a[motif_threshold] = a[motif_threshold]+pas_to_append
else:
a[motif_threshold] = pas_to_append
print(tissue+' done')
res = {}
summary = []
for motif_threshold in motif_thresholds:
tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
nose done trachea done heart done intestine done breast done bone done pancreas done eye done kidney done penis done ureter done lung done liver done skin done prostate done uterus done bloodImmune done brain done 0.6 done 0.65 done 0.7 done 0.75 done 0.8 done 0.85 done
In [70]:
summary.columns = ['within_tissue_motif_threshold','# PAS', 'fraction of PAS with motif after union']
In [71]:
summary
Out[71]:
| within_tissue_motif_threshold | # PAS | fraction of PAS with motif after union | |
|---|---|---|---|
| 0 | 0.60 | 162340 | 0.443606 |
| 1 | 0.65 | 119060 | 0.487443 |
| 2 | 0.70 | 87939 | 0.538828 |
| 3 | 0.75 | 63687 | 0.601300 |
| 4 | 0.80 | 44893 | 0.673557 |
| 5 | 0.85 | 30849 | 0.753606 |
In [72]:
195574/63687
Out[72]:
3.0708621853753515
In [75]:
# if not within each tissue
a = {}
tmp = df[['new_id','score','any_canonic_motif']].sort_values('score',ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
for motif_threshold in motif_thresholds:
max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['new_id'])
if a.get(motif_threshold) is not None:
a[motif_threshold] = a[motif_threshold]+pas_to_append
else:
a[motif_threshold] = pas_to_append
res = {}
summary = []
for motif_threshold in motif_thresholds:
tmp = pd.DataFrame(list(set(a[motif_threshold])),columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
summary.append([motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)])
print(str(motif_threshold)+' done')
summary = pd.DataFrame(summary,columns = ['motif_threshold','t_PAS','motif_fraction'])
summary
0.6 done 0.65 done 0.7 done 0.75 done 0.8 done 0.85 done
Out[75]:
| motif_threshold | t_PAS | motif_fraction | |
|---|---|---|---|
| 0 | 0.60 | 48593 | 0.600004 |
| 1 | 0.65 | 38183 | 0.650001 |
| 2 | 0.70 | 30482 | 0.700020 |
| 3 | 0.75 | 24085 | 0.750010 |
| 4 | 0.80 | 18488 | 0.800032 |
| 5 | 0.85 | 13527 | 0.850004 |
In [76]:
195574/48593
Out[76]:
4.024736073096948
In [80]:
# add quantiles by number of PAS in the gene
# df['PAS_num_cat'] = pd.qcut(df['t'],q = 500)
df['PAS_num_cat'] = df['t']
motif_threshold = 0.6
feature = 'score'
def get_filtered_PAS(L,data,feature,motif_threshold,iterator):
# 1. define expression quantiles and loop within them
expr_feature = feature+'_sum'
scoring_feature = feature+'_ratio'
data['expr_cat'] = pd.qcut(data[expr_feature],q=10,duplicates='drop')
for expr_cat in list(data['expr_cat'].unique()):
tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
if tmp['frac_cumul'].max()>=motif_threshold:
max_index = max(tmp.loc[tmp['frac_cumul']>=motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['new_id'])
L.append(pas_to_append)
if iterator%5==0:
print(str(iterator)+' done, '+str(time.time()-start_time))
start_time = time.time()
with Manager() as manager:
L = manager.list()
processes = []
i=0
for PAS_num_cat in list(df['PAS_num_cat'].unique()):
data = df.loc[df['PAS_num_cat']==PAS_num_cat].reset_index(drop=True)
p = Process(target=get_filtered_PAS, args=(L,data,feature,motif_threshold,i)) # Passing the list
p.start()
processes.append(p)
i=i+1
for p in processes:
p.join()
L = list(L)
flattened = []
for elem in L:
flattened = flattened+elem
flattened = list(set(flattened))
0 done, 0.2471604347229004 5 done, 0.6588912010192871 10 done, 1.0607943534851074 15 done, 1.4608628749847412 20 done, 1.8512182235717773 25 done, 2.256883382797241 30 done, 2.683002471923828 35 done, 3.0614254474639893 40 done, 3.464388847351074 45 done, 3.867133855819702 50 done, 4.2730114459991455 55 done, 4.648865461349487 60 done, 5.08697772026062 65 done, 5.479453802108765 70 done, 5.8689141273498535 75 done, 6.2871129512786865 80 done, 6.686724662780762 85 done, 7.074393272399902 90 done, 7.4492998123168945 95 done, 7.856955289840698 100 done, 8.238194465637207 105 done, 8.64706540107727 110 done, 9.030844926834106 115 done, 9.443076610565186 120 done, 9.844780921936035 125 done, 10.239341974258423 130 done, 10.628443717956543 135 done, 11.011773824691772 140 done, 11.42781114578247 145 done, 11.8036630153656 150 done, 12.17158031463623 155 done, 12.577524662017822 160 done, 12.946401119232178 165 done, 13.345419645309448 170 done, 13.738389492034912 175 done, 14.143964529037476 180 done, 14.518626928329468 185 done, 14.90165662765503 190 done, 15.331796884536743 195 done, 15.73563838005066 200 done, 16.135384559631348 205 done, 16.53104329109192 210 done, 16.907857656478882 215 done, 17.326249837875366 220 done, 17.73921489715576 225 done, 18.1227707862854 230 done, 18.50727081298828 235 done, 18.90893530845642 240 done, 19.289853811264038 245 done, 19.69845747947693 250 done, 20.077762365341187 255 done, 20.459183931350708 260 done, 20.863707065582275 265 done, 21.253548622131348 270 done, 21.65330195426941 275 done, 22.085458517074585 280 done, 22.451693058013916 285 done, 22.84962296485901 290 done, 23.227025032043457 295 done, 23.6243577003479 300 done, 24.011998653411865 305 done, 24.41333293914795 310 done, 24.796615839004517 315 done, 25.197178840637207 320 done, 25.599594593048096 325 done, 25.983662843704224 330 done, 26.36910319328308 335 done, 26.7863347530365 340 done, 27.162992238998413 345 done, 27.538270711898804 350 done, 27.912529230117798 355 done, 28.312389612197876 360 done, 28.67923331260681 365 done, 29.084048748016357 370 done, 29.494255542755127 375 done, 29.85740375518799 380 done, 30.232964754104614 385 done, 30.61388397216797 390 done, 30.996392488479614 395 done, 31.379972457885742 400 done, 31.761265993118286 405 done, 32.14472007751465 410 done, 32.52274680137634 415 done, 32.928714752197266 420 done, 33.302762508392334 425 done, 33.69055366516113 430 done, 34.077784061431885 435 done, 34.47099709510803 440 done, 34.842801570892334 445 done, 35.241976261138916 450 done, 35.63378548622131 455 done, 36.02464771270752 460 done, 36.43061399459839 465 done, 36.78756785392761 470 done, 37.20529890060425 475 done, 37.58163619041443 480 done, 37.98228168487549 485 done, 38.38580346107483 490 done, 38.763463497161865 495 done, 39.167964458465576 500 done, 39.57504725456238 505 done, 39.962172985076904 510 done, 40.37232208251953 515 done, 40.72806191444397 520 done, 41.1547064781189 525 done, 41.500030517578125 530 done, 41.88764786720276 535 done, 42.276387453079224 540 done, 42.66238570213318 545 done, 43.06632137298584 550 done, 43.41918349266052 555 done, 43.84536623954773 560 done, 44.23313283920288 565 done, 44.61800694465637 570 done, 44.98036813735962 575 done, 45.401899099349976 580 done, 45.763211488723755 585 done, 46.161136865615845 590 done, 46.52697801589966 595 done, 46.93542218208313 600 done, 47.320653438568115 605 done, 47.72657775878906 610 done, 48.08842325210571 615 done, 48.48499584197998 620 done, 48.886828660964966 625 done, 49.2636935710907 630 done, 49.62689685821533 635 done, 50.014978647232056 640 done, 50.40575194358826 645 done, 50.784011363983154 650 done, 51.18371343612671 655 done, 51.571752309799194 660 done, 51.93593764305115 665 done, 52.334174394607544 670 done, 52.7110710144043 675 done, 53.0884952545166 680 done, 53.50185513496399 685 done, 53.885148763656616 690 done, 54.26203751564026 695 done, 54.63493061065674 700 done, 55.023661613464355 705 done, 55.389198541641235 710 done, 55.76806855201721 715 done, 56.14607501029968 720 done, 56.551657915115356 725 done, 56.93309020996094 730 done, 57.30261588096619 735 done, 57.6970489025116 740 done, 58.11862564086914 745 done, 58.47442317008972 750 done, 58.85162162780762 755 done, 59.250535011291504 760 done, 59.627315521240234 765 done, 60.023086071014404 770 done, 60.39529895782471 775 done, 60.779624462127686 780 done, 61.16417098045349 785 done, 61.54671096801758 790 done, 61.9278347492218 795 done, 62.317710161209106 800 done, 62.712127685546875 805 done, 63.08165884017944 810 done, 63.475751876831055 815 done, 63.857280254364014 820 done, 64.22975277900696 825 done, 64.61401748657227 830 done, 65.0001471042633 835 done, 65.38176822662354 840 done, 65.77804565429688 845 done, 66.17784404754639 850 done, 66.53648948669434 855 done, 66.92903685569763 860 done, 67.31445503234863 865 done, 67.72422313690186 870 done, 68.0761365890503 875 done, 68.48452425003052 880 done, 68.87054109573364 885 done, 69.25377488136292 890 done, 69.64117527008057 895 done, 70.04185366630554 900 done, 70.4283287525177 905 done, 70.8021388053894 910 done, 71.19557476043701 915 done, 71.5963990688324 920 done, 71.977942943573 925 done, 72.37270736694336 930 done, 72.74741411209106 935 done, 73.14815425872803 940 done, 73.52707505226135 945 done, 73.90832018852234 950 done, 74.29949593544006 955 done, 74.68591737747192 960 done, 75.06726360321045 965 done, 75.45162081718445 970 done, 75.83878898620605 975 done, 76.2189781665802 980 done, 76.61057901382446 985 done, 76.9794180393219 990 done, 77.39806008338928 995 done, 77.76587128639221 1000 done, 78.14691424369812 1005 done, 78.51626181602478 1010 done, 78.9102246761322 1015 done, 79.28442621231079 1020 done, 79.65670680999756 1025 done, 80.05499053001404 1030 done, 80.42851424217224 1035 done, 80.81850171089172 1040 done, 81.19444751739502 1045 done, 81.5593330860138 1050 done, 81.95001459121704 1055 done, 82.3365957736969 1060 done, 82.72114825248718 1065 done, 83.09857678413391 1070 done, 83.4881739616394 1075 done, 83.86461305618286 1080 done, 84.24799990653992 1085 done, 84.62825465202332 1090 done, 85.01897430419922 1095 done, 85.38664197921753 1100 done, 85.78169631958008 1105 done, 86.15666961669922 1110 done, 86.5611732006073 1115 done, 86.94381070137024 1120 done, 87.34171795845032 1125 done, 87.71135997772217 1130 done, 88.10237002372742 1135 done, 88.49527525901794 1140 done, 88.88247156143188 1145 done, 89.26415753364563 1150 done, 89.64172196388245 1155 done, 90.02797794342041 1160 done, 90.4079098701477 1165 done, 90.78547263145447 1170 done, 91.17601299285889 1175 done, 91.55146670341492 1180 done, 91.93451476097107 1185 done, 92.32014036178589 1190 done, 92.69587135314941 1195 done, 93.08174061775208 1200 done, 93.46492028236389 1205 done, 93.8482985496521 1210 done, 94.23295974731445 1215 done, 94.60936951637268 1220 done, 95.0038959980011 1225 done, 95.38508462905884 1230 done, 95.76945805549622 1235 done, 96.15371298789978 1240 done, 96.52788472175598 1245 done, 96.90053844451904 1250 done, 97.27894330024719 1255 done, 97.66366076469421 1260 done, 98.04296684265137 1265 done, 98.43003749847412 1270 done, 98.81429862976074 1275 done, 99.20108771324158 1280 done, 99.57811784744263 1285 done, 99.96428442001343 1290 done, 100.33957052230835 1295 done, 100.72369456291199 1300 done, 101.09698104858398 1305 done, 101.47606587409973 1310 done, 101.85790014266968 1315 done, 102.24446177482605 1320 done, 102.63164377212524 1325 done, 103.01276230812073 1330 done, 103.40927982330322 1335 done, 103.80992794036865 1340 done, 104.20266771316528 1345 done, 104.57926177978516 1350 done, 104.98246169090271 1355 done, 105.36634707450867 1360 done, 105.74168968200684 1365 done, 106.12457513809204 1370 done, 106.51714992523193 1375 done, 106.89436912536621 1380 done, 107.27085256576538 1385 done, 107.66155433654785 1390 done, 108.028315782547 1395 done, 108.41190671920776 1400 done, 108.80394220352173 1405 done, 109.17514181137085 1410 done, 109.56265091896057 1415 done, 109.95081114768982 1420 done, 110.33561420440674 1425 done, 110.71273994445801 1430 done, 111.09171772003174 1435 done, 111.47609615325928 1440 done, 111.85429739952087 1445 done, 112.23667860031128 1450 done, 112.61955165863037 1455 done, 112.99793148040771 1460 done, 113.38851833343506 1465 done, 113.76855301856995 1470 done, 114.1660807132721 1475 done, 114.5475754737854 1480 done, 114.92905402183533 1485 done, 115.31338858604431 1490 done, 115.68027997016907 1495 done, 116.06084060668945 1500 done, 116.44997668266296 1505 done, 116.83036518096924 1510 done, 117.21098399162292 1515 done, 117.59425711631775 1520 done, 117.96703767776489 1525 done, 118.33863472938538 1530 done, 118.72356271743774 1535 done, 119.1332643032074 1540 done, 119.48778557777405 1545 done, 119.86788535118103 1550 done, 120.24672412872314 1555 done, 120.6217429637909 1560 done, 120.99695110321045 1565 done, 121.37977004051208 1570 done, 121.77221918106079 1575 done, 122.17509388923645 1580 done, 122.55719995498657 1585 done, 122.93667984008789 1590 done, 123.35222458839417 1595 done, 123.74211883544922 1600 done, 124.12389636039734 1605 done, 124.5122582912445 1610 done, 124.8997015953064 1615 done, 125.28213596343994 1620 done, 125.68218231201172 1625 done, 126.08383679389954 1630 done, 126.47371482849121 1635 done, 126.84444046020508 1640 done, 127.23132658004761 1645 done, 127.61368918418884 1650 done, 127.99730610847473 1655 done, 128.3777027130127 1660 done, 128.75566053390503 1665 done, 129.14171528816223 1670 done, 129.5226104259491 1675 done, 129.9076681137085 1680 done, 130.2842676639557 1685 done, 130.67197632789612 1690 done, 131.0692653656006 1695 done, 131.43784141540527 1700 done, 131.80851578712463 1705 done, 132.19280290603638 1710 done, 132.5805115699768 1715 done, 132.9616162776947 1720 done, 133.34007048606873 1725 done, 133.7207226753235 1730 done, 134.11619019508362 1735 done, 134.49426531791687 1740 done, 134.90068554878235 1745 done, 135.2692232131958 1750 done, 135.65832996368408 1755 done, 136.03240180015564 1760 done, 136.41043710708618 1765 done, 136.80464482307434 1770 done, 137.2087528705597 1775 done, 137.58966326713562 1780 done, 137.96461987495422 1785 done, 138.34827375411987 1790 done, 138.72814297676086 1795 done, 139.11354804039001 1800 done, 139.5034441947937 1805 done, 139.89383268356323 1810 done, 140.2939121723175 1815 done, 140.6767635345459 1820 done, 141.06733393669128 1825 done, 141.45396995544434
In [81]:
len(flattened)
Out[81]:
86567
In [82]:
tmp = pd.DataFrame(flattened,columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)
Out[82]:
(0.6, 86567, 0.6127161620478936)
In [90]:
len(df.loc[df['t']<4])/len(df)
Out[90]:
0.0019937976516339003
In [92]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))
ax = sns.histplot(df['t'],stat='density')
ax.set(xlabel = '# PAS in the gene')
Out[92]:
[Text(0.5, 0, '# PAS in the gene')]
In [ ]:
In [84]:
# let's try within each tissue
# add quantiles by number of PAS in the gene
# df['PAS_num_cat'] = pd.qcut(df['t'],q = 100)
df['PAS_num_cat'] = df['t']
motif_threshold = 0.72
def get_filtered_PAS(L,data,feature,motif_threshold,iterator_i,iterator_j):
# 1. define expression quantiles and loop within them
expr_feature = feature+'_sum'
scoring_feature = feature+'_ratio'
data['expr_cat'] = pd.qcut(data[expr_feature],q=10,duplicates='drop')
for expr_cat in list(data['expr_cat'].unique()):
tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
if tmp['frac_cumul'].max()>=motif_threshold:
max_index = max(tmp.loc[tmp['frac_cumul']>=motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['new_id'])
L.append(pas_to_append)
if iterator_i%5==0 and iterator_j%5==0:
print(str(iterator_i)+','+str(iterator_j)+' done, '+str(time.time()-start_time))
start_time = time.time()
with Manager() as manager:
L = manager.list()
processes = []
i=0
for PAS_num_cat in list(df['PAS_num_cat'].unique()):
data = df.loc[df['PAS_num_cat']==PAS_num_cat].reset_index(drop=True)
j=0
for feature in tissues:
p = Process(target=get_filtered_PAS, args=(L,data,feature,motif_threshold,i,j)) # Passing the list
p.start()
processes.append(p)
j=j+1
i=i+1
for p in processes:
p.join()
L = list(L)
flattened = []
for elem in L:
flattened = flattened+elem
flattened = list(set(flattened))
0,0 done, 0.24164724349975586 0,5 done, 0.5212569236755371 0,10 done, 0.81752610206604 0,15 done, 1.0853595733642578 5,0 done, 5.4705047607421875 5,5 done, 5.761173963546753 5,10 done, 6.051577568054199 5,15 done, 6.333654165267944 10,0 done, 10.630130052566528 10,5 done, 10.926160097122192 10,10 done, 11.209879159927368 10,15 done, 11.477243185043335 15,0 done, 15.771695137023926 15,5 done, 16.049487352371216 15,10 done, 16.328447580337524 15,15 done, 16.602094173431396 20,0 done, 20.91411256790161 20,5 done, 21.203969955444336 20,10 done, 21.489899158477783 20,15 done, 21.76624894142151 25,0 done, 26.057672262191772 25,5 done, 26.322872400283813 25,10 done, 26.613312482833862 25,15 done, 26.892600297927856 30,0 done, 31.18018126487732 30,5 done, 31.467642307281494 30,10 done, 31.741339206695557 30,15 done, 32.03318119049072 35,0 done, 36.32417607307434 35,5 done, 36.602927923202515 35,10 done, 36.88266682624817 35,15 done, 37.15081572532654 40,0 done, 41.440335273742676 40,5 done, 41.73688077926636 40,10 done, 42.00178623199463 40,15 done, 42.289522647857666 45,0 done, 46.640711545944214 45,5 done, 46.92738127708435 45,10 done, 47.2142596244812 45,15 done, 47.48786950111389 50,0 done, 51.81929898262024 50,5 done, 52.08564281463623 50,10 done, 52.373751401901245 50,15 done, 52.652676820755005 55,0 done, 56.91909384727478 55,5 done, 57.198766231536865 55,10 done, 57.471991300582886 55,15 done, 57.749422788619995 60,0 done, 62.03436732292175 60,5 done, 62.318875789642334 60,10 done, 62.57627630233765 60,15 done, 62.85427713394165 65,0 done, 67.16364550590515 65,5 done, 67.45202422142029 65,10 done, 67.71409487724304 65,15 done, 67.99264407157898 70,0 done, 72.27025294303894 70,5 done, 72.56512904167175 70,10 done, 72.84230852127075 70,15 done, 73.12736582756042 75,0 done, 77.39294958114624 75,5 done, 77.67104125022888 75,10 done, 77.94792890548706 75,15 done, 78.22971296310425 80,0 done, 82.5317018032074 80,5 done, 82.80428385734558 80,10 done, 83.08894920349121 80,15 done, 83.39330959320068 85,0 done, 87.64873123168945 85,5 done, 87.93561434745789 85,10 done, 88.21007251739502 85,15 done, 88.48969030380249 90,0 done, 92.78563284873962 90,5 done, 93.08458256721497 90,10 done, 93.33713006973267 90,15 done, 93.61748552322388 95,0 done, 98.00365352630615 95,5 done, 98.26403546333313 95,10 done, 98.54574513435364 95,15 done, 98.84749341011047 100,0 done, 103.11708068847656 100,5 done, 103.39178586006165 100,10 done, 103.67576503753662 100,15 done, 103.96727585792542 105,0 done, 108.26972770690918 105,5 done, 108.56708693504333 105,10 done, 108.83837223052979 105,15 done, 109.10081624984741 110,0 done, 113.41192603111267 110,5 done, 113.70247960090637 110,10 done, 113.98627209663391 110,15 done, 114.27441096305847 115,0 done, 118.64116287231445 115,5 done, 118.93541550636292 115,10 done, 119.1908106803894 115,15 done, 119.45669746398926 120,0 done, 123.63670516014099 120,5 done, 123.8972225189209 120,10 done, 124.18350720405579 120,15 done, 124.43578362464905 125,0 done, 128.60830187797546 125,5 done, 128.87898921966553 125,10 done, 129.1560435295105 125,15 done, 129.40236401557922 130,0 done, 133.5451786518097 130,5 done, 133.82622599601746 130,10 done, 134.08655524253845 130,15 done, 134.36100339889526 135,0 done, 138.4590892791748 135,5 done, 138.75429821014404 135,10 done, 139.02001309394836 135,15 done, 139.29079627990723 140,0 done, 143.39318323135376 140,5 done, 143.69177794456482 140,10 done, 143.96799731254578 140,15 done, 144.23493552207947 145,0 done, 148.43969774246216 145,5 done, 148.71066999435425 145,10 done, 148.9861717224121 145,15 done, 149.24526119232178 150,0 done, 153.36662650108337 150,5 done, 153.64371848106384 150,10 done, 153.91676712036133 150,15 done, 154.18864917755127 155,0 done, 158.43565392494202 155,5 done, 158.70899367332458 155,10 done, 158.97197914123535 155,15 done, 159.2377965450287 160,0 done, 163.3858962059021 160,5 done, 163.65737318992615 160,10 done, 163.93180561065674 160,15 done, 164.21466994285583 165,0 done, 168.39101266860962 165,5 done, 168.6672146320343 165,10 done, 168.9538815021515 165,15 done, 169.22979140281677 170,0 done, 173.37977027893066 170,5 done, 173.6368486881256 170,10 done, 173.92251873016357 170,15 done, 174.19756603240967 175,0 done, 178.31804513931274 175,5 done, 178.60632276535034 175,10 done, 178.87167501449585 175,15 done, 179.1398069858551 180,0 done, 183.30749940872192 180,5 done, 183.58507657051086 180,10 done, 183.8646092414856 180,15 done, 184.13516879081726 185,0 done, 188.29443383216858 185,5 done, 188.56792187690735 185,10 done, 188.84500741958618 185,15 done, 189.12130451202393 190,0 done, 193.30440855026245 190,5 done, 193.59757494926453 190,10 done, 193.8599410057068 190,15 done, 194.11533164978027 195,0 done, 198.30212664604187 195,5 done, 198.55965900421143 195,10 done, 198.82527256011963 195,15 done, 199.0986783504486 200,0 done, 203.30581951141357 200,5 done, 203.5724618434906 200,10 done, 203.85144686698914 200,15 done, 204.1353521347046 205,0 done, 208.303231716156 205,5 done, 208.55742740631104 205,10 done, 208.84722137451172 205,15 done, 209.10077214241028 210,0 done, 213.27168798446655 210,5 done, 213.55215764045715 210,10 done, 213.8308389186859 210,15 done, 214.09841299057007 215,0 done, 218.29819059371948 215,5 done, 218.55914211273193 215,10 done, 218.83475255966187 215,15 done, 219.1120729446411 220,0 done, 223.3249795436859 220,5 done, 223.6068286895752 220,10 done, 223.87664270401 220,15 done, 224.1399097442627 225,0 done, 228.36349821090698 225,5 done, 228.63332986831665 225,10 done, 228.89439010620117 225,15 done, 229.25585460662842 230,0 done, 233.55262327194214 230,5 done, 233.8238444328308 230,10 done, 234.1181252002716 230,15 done, 234.37919402122498 235,0 done, 238.5248007774353 235,5 done, 238.79728317260742 235,10 done, 239.06859064102173 235,15 done, 239.35625910758972 240,0 done, 243.51157665252686 240,5 done, 243.7708761692047 240,10 done, 244.04712796211243 240,15 done, 244.32174253463745 245,0 done, 248.5042643547058 245,5 done, 248.76883506774902 245,10 done, 249.03928089141846 245,15 done, 249.32917380332947 250,0 done, 253.5294153690338 250,5 done, 253.80415177345276 250,10 done, 254.06284022331238 250,15 done, 254.35352611541748 255,0 done, 258.5487151145935 255,5 done, 258.82013034820557 255,10 done, 259.09327578544617 255,15 done, 259.3798871040344 260,0 done, 263.5269412994385 260,5 done, 263.797399520874 260,10 done, 264.0839650630951 260,15 done, 264.3481044769287 265,0 done, 268.5343379974365 265,5 done, 268.8143243789673 265,10 done, 269.0817859172821 265,15 done, 269.3701205253601 270,0 done, 273.5129625797272 270,5 done, 273.7865467071533 270,10 done, 274.06920075416565 270,15 done, 274.35462403297424 275,0 done, 278.50887989997864 275,5 done, 278.79722023010254 275,10 done, 279.07101917266846 275,15 done, 279.3437204360962 280,0 done, 283.5157153606415 280,5 done, 283.80007004737854 280,10 done, 284.0703332424164 280,15 done, 284.34508299827576 285,0 done, 288.5400755405426 285,5 done, 288.82456374168396 285,10 done, 289.09572172164917 285,15 done, 289.3652627468109 290,0 done, 293.4575197696686 290,5 done, 293.73136162757874 290,10 done, 294.00155901908875 290,15 done, 294.2786808013916 295,0 done, 298.4423358440399 295,5 done, 298.7234516143799 295,10 done, 298.98983240127563 295,15 done, 299.2665162086487 300,0 done, 303.42761850357056 300,5 done, 303.6991219520569 300,10 done, 303.9754412174225 300,15 done, 304.2584960460663 305,0 done, 308.42584013938904 305,5 done, 308.70153164863586 305,10 done, 308.976345539093 305,15 done, 309.25114154815674 310,0 done, 313.4364240169525 310,5 done, 313.7071797847748 310,10 done, 313.9830791950226 310,15 done, 314.27019119262695 315,0 done, 318.4357695579529 315,5 done, 318.71538734436035 315,10 done, 318.9992172718048 315,15 done, 319.2796595096588 320,0 done, 323.44446897506714 320,5 done, 323.7331793308258 320,10 done, 323.9742908477783 320,15 done, 324.2472264766693 325,0 done, 328.39719438552856 325,5 done, 328.6690971851349 325,10 done, 328.94412422180176 325,15 done, 329.2228088378906 330,0 done, 333.3870105743408 330,5 done, 333.6663155555725 330,10 done, 333.93312907218933 330,15 done, 334.20839262008667 335,0 done, 338.39019799232483 335,5 done, 338.6809298992157 335,10 done, 338.9510474205017 335,15 done, 339.2087388038635 340,0 done, 343.4102487564087 340,5 done, 343.68486523628235 340,10 done, 343.965167760849 340,15 done, 344.2312169075012 345,0 done, 348.472149848938 345,5 done, 348.7231192588806 345,10 done, 349.012327671051 345,15 done, 349.28609442710876 350,0 done, 353.46914744377136 350,5 done, 353.74036693573 350,10 done, 354.01755380630493 350,15 done, 354.2918860912323 355,0 done, 358.5105490684509 355,5 done, 358.76985931396484 355,10 done, 359.04974389076233 355,15 done, 359.3324444293976 360,0 done, 363.5048487186432 360,5 done, 363.7737581729889 360,10 done, 364.0433859825134 360,15 done, 364.3175559043884 365,0 done, 368.5716965198517 365,5 done, 368.843279838562 365,10 done, 369.1293594837189 365,15 done, 369.4007685184479 370,0 done, 373.6319532394409 370,5 done, 373.8940975666046 370,10 done, 374.17779183387756 370,15 done, 374.44702672958374 375,0 done, 378.6593904495239 375,5 done, 378.9323399066925 375,10 done, 379.2021381855011 375,15 done, 379.48533844947815 380,0 done, 383.6702415943146 380,5 done, 383.94545817375183 380,10 done, 384.22287130355835 380,15 done, 384.4988696575165 385,0 done, 388.69513869285583 385,5 done, 388.96689343452454 385,10 done, 389.2469696998596 385,15 done, 389.51947593688965 390,0 done, 393.7057538032532 390,5 done, 393.96474170684814 390,10 done, 394.2424404621124 390,15 done, 394.53152203559875 395,0 done, 398.75367045402527 395,5 done, 399.0276517868042 395,10 done, 399.27933835983276 395,15 done, 399.5656735897064 400,0 done, 403.77329754829407 400,5 done, 404.04234313964844 400,10 done, 404.3308811187744 400,15 done, 404.601763010025 405,0 done, 408.78511667251587 405,5 done, 409.06988739967346 405,10 done, 409.3475093841553 405,15 done, 409.6308686733246 410,0 done, 413.8238501548767 410,5 done, 414.10115122795105 410,10 done, 414.3725814819336 410,15 done, 414.6550180912018 415,0 done, 418.82885575294495 415,5 done, 419.08973932266235 415,10 done, 419.35843229293823 415,15 done, 419.6428337097168 420,0 done, 423.8292908668518 420,5 done, 424.1023871898651 420,10 done, 424.382611989975 420,15 done, 424.66775131225586 425,0 done, 428.86771273612976 425,5 done, 429.1471335887909 425,10 done, 429.422523021698 425,15 done, 429.7044162750244 430,0 done, 433.91064167022705 430,5 done, 434.1735727787018 430,10 done, 434.46255826950073 430,15 done, 434.739470243454 435,0 done, 438.8892867565155 435,5 done, 439.16989159584045 435,10 done, 439.4443418979645 435,15 done, 439.7211456298828 440,0 done, 443.89678859710693 440,5 done, 444.17357993125916 440,10 done, 444.4485948085785 440,15 done, 444.72360610961914 445,0 done, 448.9459762573242 445,5 done, 449.22178292274475 445,10 done, 449.5010802745819 445,15 done, 449.7735559940338 450,0 done, 454.02100229263306 450,5 done, 454.2927129268646 450,10 done, 454.5714707374573 450,15 done, 454.8255846500397 455,0 done, 459.09684801101685 455,5 done, 459.3705313205719 455,10 done, 459.6519458293915 455,15 done, 459.92254996299744 460,0 done, 464.0831913948059 460,5 done, 464.3493025302887 460,10 done, 464.64526414871216 460,15 done, 464.90820121765137 465,0 done, 469.03287625312805 465,5 done, 469.3082573413849 465,10 done, 469.58028745651245 465,15 done, 469.8574597835541 470,0 done, 474.10878252983093 470,5 done, 474.3696496486664 470,10 done, 474.63843274116516 470,15 done, 474.9300651550293 475,0 done, 479.0802249908447 475,5 done, 479.3631842136383 475,10 done, 479.628892660141 475,15 done, 479.91116189956665 480,0 done, 484.05895352363586 480,5 done, 484.33325934410095 480,10 done, 484.6120719909668 480,15 done, 484.8843674659729 485,0 done, 489.08441138267517 485,5 done, 489.36877155303955 485,10 done, 489.64523363113403 485,15 done, 489.9195625782013 490,0 done, 494.0961422920227 490,5 done, 494.3709843158722 490,10 done, 494.65176463127136 490,15 done, 494.91996145248413 495,0 done, 499.2552535533905 495,5 done, 499.54716444015503 495,10 done, 499.8217990398407 495,15 done, 500.10638642311096 500,0 done, 504.4313666820526 500,5 done, 504.72011852264404 500,10 done, 504.9935176372528 500,15 done, 505.280668258667 505,0 done, 509.5926010608673 505,5 done, 509.8793866634369 505,10 done, 510.13716340065 505,15 done, 510.43706250190735 510,0 done, 514.7864050865173 510,5 done, 515.0718212127686 510,10 done, 515.354480266571 510,15 done, 515.6221823692322 515,0 done, 519.8753478527069 515,5 done, 520.1561980247498 515,10 done, 520.4393167495728 515,15 done, 520.7242593765259 520,0 done, 525.0359773635864 520,5 done, 525.312933921814 520,10 done, 525.5941216945648 520,15 done, 525.8661017417908 525,0 done, 530.218088388443 525,5 done, 530.5031416416168 525,10 done, 530.7850489616394 525,15 done, 531.0679726600647 530,0 done, 535.4243264198303 530,5 done, 535.7200374603271 530,10 done, 535.99968957901 530,15 done, 536.2847936153412 535,0 done, 540.6449003219604 535,5 done, 540.9155657291412 535,10 done, 541.209760427475 535,15 done, 541.4948670864105 540,0 done, 545.8411958217621 540,5 done, 546.121990442276 540,10 done, 546.4087655544281 540,15 done, 546.6889855861664 545,0 done, 551.0360252857208 545,5 done, 551.3018114566803 545,10 done, 551.5975475311279 545,15 done, 551.8622314929962 550,0 done, 556.1928405761719 550,5 done, 556.4822909832001 550,10 done, 556.7638664245605 550,15 done, 557.0472047328949 555,0 done, 561.4294264316559 555,5 done, 561.7187008857727 555,10 done, 561.9917771816254 555,15 done, 562.2758059501648 560,0 done, 566.6590075492859 560,5 done, 566.9476583003998 560,10 done, 567.2245280742645 560,15 done, 567.5043573379517 565,0 done, 571.7852427959442 565,5 done, 572.0702300071716 565,10 done, 572.3465735912323 565,15 done, 572.6115500926971 570,0 done, 576.8374016284943 570,5 done, 577.1153447628021 570,10 done, 577.3969025611877 570,15 done, 577.6735191345215 575,0 done, 581.9362254142761 575,5 done, 582.2178158760071 575,10 done, 582.4650197029114 575,15 done, 582.7390463352203 580,0 done, 586.952977180481 580,5 done, 587.2221784591675 580,10 done, 587.5044434070587 580,15 done, 587.7847306728363 585,0 done, 591.9671263694763 585,5 done, 592.2525153160095 585,10 done, 592.5224878787994 585,15 done, 592.8013548851013 590,0 done, 596.9636476039886 590,5 done, 597.2385721206665 590,10 done, 597.5195469856262 590,15 done, 597.7908568382263 595,0 done, 602.0456335544586 595,5 done, 602.3154339790344 595,10 done, 602.5944061279297 595,15 done, 602.8628342151642 600,0 done, 607.0339159965515 600,5 done, 607.2962026596069 600,10 done, 607.5683290958405 600,15 done, 607.8636562824249 605,0 done, 612.0601444244385 605,5 done, 612.3437383174896 605,10 done, 612.6189403533936 605,15 done, 612.8660061359406 610,0 done, 617.0690739154816 610,5 done, 617.3497817516327 610,10 done, 617.6225843429565 610,15 done, 617.9030134677887 615,0 done, 622.0675826072693 615,5 done, 622.3402104377747 615,10 done, 622.6234769821167 615,15 done, 622.9033229351044 620,0 done, 627.129613161087 620,5 done, 627.4062819480896 620,10 done, 627.6558187007904 620,15 done, 627.9268844127655 625,0 done, 632.124080657959 625,5 done, 632.4104740619659 625,10 done, 632.6695499420166 625,15 done, 632.9564731121063 630,0 done, 637.1549327373505 630,5 done, 637.4316415786743 630,10 done, 637.7098526954651 630,15 done, 637.9892203807831 635,0 done, 642.2241199016571 635,5 done, 642.5014252662659 635,10 done, 642.7796883583069 635,15 done, 643.0554611682892 640,0 done, 647.3258881568909 640,5 done, 647.6019639968872 640,10 done, 647.8796949386597 640,15 done, 648.1600773334503 645,0 done, 652.3728315830231 645,5 done, 652.6490709781647 645,10 done, 652.9247798919678 645,15 done, 653.2050352096558 650,0 done, 657.4679977893829 650,5 done, 657.7156465053558 650,10 done, 658.0092985630035 650,15 done, 658.2637267112732 655,0 done, 662.5754718780518 655,5 done, 662.8478174209595 655,10 done, 663.1296441555023 655,15 done, 663.4150671958923 660,0 done, 667.7552168369293 660,5 done, 668.0531363487244 660,10 done, 668.3349828720093 660,15 done, 668.6215255260468 665,0 done, 672.988890171051 665,5 done, 673.2644264698029 665,10 done, 673.5445265769958 665,15 done, 673.8222689628601 670,0 done, 678.2126438617706 670,5 done, 678.4963080883026 670,10 done, 678.7728431224823 670,15 done, 679.0586297512054 675,0 done, 683.3918299674988 675,5 done, 683.6764545440674 675,10 done, 683.963515996933 675,15 done, 684.2404139041901 680,0 done, 688.5895857810974 680,5 done, 688.8738136291504 680,10 done, 689.1588642597198 680,15 done, 689.4532177448273 685,0 done, 693.7912862300873 685,5 done, 694.0671660900116 685,10 done, 694.3562915325165 685,15 done, 694.6468744277954 690,0 done, 698.9622225761414 690,5 done, 699.2441654205322 690,10 done, 699.5309822559357 690,15 done, 699.8142523765564 695,0 done, 704.176023721695 695,5 done, 704.4582962989807 695,10 done, 704.7455811500549 695,15 done, 705.0278754234314 700,0 done, 709.4494717121124 700,5 done, 709.735095500946 700,10 done, 710.0313882827759 700,15 done, 710.3094072341919 705,0 done, 714.6528625488281 705,5 done, 714.9334189891815 705,10 done, 715.2172918319702 705,15 done, 715.5013883113861 710,0 done, 719.897864818573 710,5 done, 720.1748478412628 710,10 done, 720.45676445961 710,15 done, 720.7433660030365 715,0 done, 725.0745985507965 715,5 done, 725.3476865291595 715,10 done, 725.6350479125977 715,15 done, 725.9191451072693 720,0 done, 730.1243464946747 720,5 done, 730.4036183357239 720,10 done, 730.6682891845703 720,15 done, 730.9664950370789 725,0 done, 735.1906020641327 725,5 done, 735.4627411365509 725,10 done, 735.7323019504547 725,15 done, 735.9839425086975 730,0 done, 740.2310519218445 730,5 done, 740.5022397041321 730,10 done, 740.7721447944641 730,15 done, 741.0643968582153 735,0 done, 745.3197801113129 735,5 done, 745.5991966724396 735,10 done, 745.8790557384491 735,15 done, 746.1433944702148 740,0 done, 750.393824338913 740,5 done, 750.6789584159851 740,10 done, 750.9453184604645 740,15 done, 751.1969521045685 745,0 done, 755.4414467811584 745,5 done, 755.7089140415192 745,10 done, 756.0033898353577 745,15 done, 756.2723898887634 750,0 done, 760.5033583641052 750,5 done, 760.7806828022003 750,10 done, 761.0586125850677 750,15 done, 761.3395943641663 755,0 done, 765.5693111419678 755,5 done, 765.8607499599457 755,10 done, 766.1120779514313 755,15 done, 766.412159204483 760,0 done, 770.5829894542694 760,5 done, 770.8523952960968 760,10 done, 771.1361811161041 760,15 done, 771.4068326950073 765,0 done, 775.6807835102081 765,5 done, 775.9448416233063 765,10 done, 776.2266187667847 765,15 done, 776.4898588657379 770,0 done, 780.7135100364685 770,5 done, 780.9960253238678 770,10 done, 781.2676248550415 770,15 done, 781.540034532547 775,0 done, 785.7953722476959 775,5 done, 786.0735812187195 775,10 done, 786.3476424217224 775,15 done, 786.6272139549255 780,0 done, 790.893114566803 780,5 done, 791.1702108383179 780,10 done, 791.4534525871277 780,15 done, 791.7239575386047 785,0 done, 795.9631559848785 785,5 done, 796.2398178577423 785,10 done, 796.5130569934845 785,15 done, 796.7889997959137 790,0 done, 801.0111815929413 790,5 done, 801.2948307991028 790,10 done, 801.581146478653 790,15 done, 801.8617186546326 795,0 done, 806.1055793762207 795,5 done, 806.3847088813782 795,10 done, 806.6642324924469 795,15 done, 806.9478843212128 800,0 done, 811.2108924388885 800,5 done, 811.467043876648 800,10 done, 811.7577788829803 800,15 done, 812.01353764534 805,0 done, 816.2692368030548 805,5 done, 816.5391526222229 805,10 done, 816.8168656826019 805,15 done, 817.094530582428 810,0 done, 821.4771661758423 810,5 done, 821.751677274704 810,10 done, 822.0233829021454 810,15 done, 822.2994844913483 815,0 done, 826.4803774356842 815,5 done, 826.7601511478424 815,10 done, 827.042578458786 815,15 done, 827.3095343112946 820,0 done, 831.6213455200195 820,5 done, 831.9081859588623 820,10 done, 832.199898481369 820,15 done, 832.4849102497101 825,0 done, 836.8639736175537 825,5 done, 837.1501755714417 825,10 done, 837.4386613368988 825,15 done, 837.7188942432404 830,0 done, 842.2067861557007 830,5 done, 842.4897561073303 830,10 done, 842.776419878006 830,15 done, 843.0630719661713 835,0 done, 847.3844666481018 835,5 done, 847.6636922359467 835,10 done, 847.9421803951263 835,15 done, 848.2229390144348 840,0 done, 852.6050081253052 840,5 done, 852.8891174793243 840,10 done, 853.17134308815 840,15 done, 853.4583516120911 845,0 done, 857.8573467731476 845,5 done, 858.1489543914795 845,10 done, 858.4046568870544 845,15 done, 858.6872222423553 850,0 done, 862.9923207759857 850,5 done, 863.2741010189056 850,10 done, 863.5625824928284 850,15 done, 863.8496580123901 855,0 done, 868.2418196201324 855,5 done, 868.5303859710693 855,10 done, 868.8160524368286 855,15 done, 869.102077960968 860,0 done, 873.4994742870331 860,5 done, 873.7883849143982 860,10 done, 874.0711283683777 860,15 done, 874.3665254116058 865,0 done, 878.7859773635864 865,5 done, 879.0804629325867 865,10 done, 879.3377728462219 865,15 done, 879.6212348937988 870,0 done, 884.0143649578094 870,5 done, 884.2982897758484 870,10 done, 884.5915343761444 870,15 done, 884.8753366470337 875,0 done, 889.2721331119537 875,5 done, 889.5469808578491 875,10 done, 889.83891248703 875,15 done, 890.1276779174805 880,0 done, 894.5073704719543 880,5 done, 894.7919833660126 880,10 done, 895.0795361995697 880,15 done, 895.3605134487152 885,0 done, 899.6969661712646 885,5 done, 899.9802167415619 885,10 done, 900.2674098014832 885,15 done, 900.5604457855225 890,0 done, 904.9466788768768 890,5 done, 905.2319974899292 890,10 done, 905.5117061138153 890,15 done, 905.7945983409882 895,0 done, 910.1624882221222 895,5 done, 910.4474611282349 895,10 done, 910.7289435863495 895,15 done, 911.0159175395966 900,0 done, 915.3851385116577 900,5 done, 915.6591608524323 900,10 done, 915.9490976333618 900,15 done, 916.2410159111023 905,0 done, 920.6494598388672 905,5 done, 920.9370155334473 905,10 done, 921.2071959972382 905,15 done, 921.486394405365 910,0 done, 925.9617817401886 910,5 done, 926.2441463470459 910,10 done, 926.5272858142853 910,15 done, 926.8141684532166 915,0 done, 931.2210638523102 915,5 done, 931.5012364387512 915,10 done, 931.7902836799622 915,15 done, 932.0779583454132 920,0 done, 936.4148895740509 920,5 done, 936.7008972167969 920,10 done, 936.9833786487579 920,15 done, 937.2718968391418 925,0 done, 941.666154384613 925,5 done, 941.9432063102722 925,10 done, 942.2414374351501 925,15 done, 942.5181872844696 930,0 done, 946.9199805259705 930,5 done, 947.2022395133972 930,10 done, 947.4846925735474 930,15 done, 947.7730929851532 935,0 done, 952.1855492591858 935,5 done, 952.4641237258911 935,10 done, 952.7553219795227 935,15 done, 953.0346581935883 940,0 done, 957.2769856452942 940,5 done, 957.5664367675781 940,10 done, 957.8540024757385 940,15 done, 958.1457755565643 945,0 done, 962.5507564544678 945,5 done, 962.8432266712189 945,10 done, 963.1270995140076 945,15 done, 963.4140055179596 950,0 done, 967.8107125759125 950,5 done, 968.1003966331482 950,10 done, 968.3898718357086 950,15 done, 968.6730215549469 955,0 done, 973.0658066272736 955,5 done, 973.3453478813171 955,10 done, 973.6333844661713 955,15 done, 973.916127204895 960,0 done, 978.2917811870575 960,5 done, 978.5568854808807 960,10 done, 978.8455209732056 960,15 done, 979.1360175609589 965,0 done, 983.5020980834961 965,5 done, 983.7733964920044 965,10 done, 984.0641558170319 965,15 done, 984.3492908477783 970,0 done, 988.7335839271545 970,5 done, 989.0228207111359 970,10 done, 989.3108632564545 970,15 done, 989.5975909233093 975,0 done, 993.9689362049103 975,5 done, 994.25949883461 975,10 done, 994.5435929298401 975,15 done, 994.8284201622009 980,0 done, 999.2368927001953 980,5 done, 999.52472448349 980,10 done, 999.8201823234558 980,15 done, 1000.1041505336761 985,0 done, 1004.4944958686829 985,5 done, 1004.7743382453918 985,10 done, 1005.0640025138855 985,15 done, 1005.3557107448578 990,0 done, 1009.8037850856781 990,5 done, 1010.087170124054 990,10 done, 1010.3725650310516 990,15 done, 1010.6510910987854 995,0 done, 1015.0388147830963 995,5 done, 1015.3268163204193 995,10 done, 1015.6026592254639 995,15 done, 1015.8919188976288 1000,0 done, 1020.292563199997 1000,5 done, 1020.5696680545807 1000,10 done, 1020.8624000549316 1000,15 done, 1021.1517825126648 1005,0 done, 1025.5269122123718 1005,5 done, 1025.8058607578278 1005,10 done, 1026.0920014381409 1005,15 done, 1026.380128145218 1010,0 done, 1030.7932331562042 1010,5 done, 1031.076064825058 1010,10 done, 1031.3664486408234 1010,15 done, 1031.6582448482513 1015,0 done, 1036.016799211502 1015,5 done, 1036.2971568107605 1015,10 done, 1036.5800948143005 1015,15 done, 1036.8655347824097 1020,0 done, 1041.2482006549835 1020,5 done, 1041.5398230552673 1020,10 done, 1041.8276278972626 1020,15 done, 1042.1181259155273 1025,0 done, 1046.4189794063568 1025,5 done, 1046.6972961425781 1025,10 done, 1046.9841032028198 1025,15 done, 1047.2734334468842 1030,0 done, 1051.6837584972382 1030,5 done, 1051.9602708816528 1030,10 done, 1052.2345156669617 1030,15 done, 1052.5315897464752 1035,0 done, 1056.8803253173828 1035,5 done, 1057.1647400856018 1035,10 done, 1057.4587049484253 1035,15 done, 1057.7395386695862 1040,0 done, 1062.1494750976562 1040,5 done, 1062.4301352500916 1040,10 done, 1062.721010684967 1040,15 done, 1063.0033733844757 1045,0 done, 1067.403354883194 1045,5 done, 1067.684360742569 1045,10 done, 1067.964678287506 1045,15 done, 1068.2606790065765 1050,0 done, 1072.6394410133362 1050,5 done, 1072.9182677268982 1050,10 done, 1073.2000091075897 1050,15 done, 1073.49178481102 1055,0 done, 1077.8869438171387 1055,5 done, 1078.1658914089203 1055,10 done, 1078.458373785019 1055,15 done, 1078.748821735382 1060,0 done, 1083.129519701004 1060,5 done, 1083.4134600162506 1060,10 done, 1083.704926252365 1060,15 done, 1083.9911887645721 1065,0 done, 1088.3839826583862 1065,5 done, 1088.6764409542084 1065,10 done, 1088.9610440731049 1065,15 done, 1089.2446382045746 1070,0 done, 1093.6419093608856 1070,5 done, 1093.9129869937897 1070,10 done, 1094.1967902183533 1070,15 done, 1094.4655084609985 1075,0 done, 1098.770290851593 1075,5 done, 1099.0566771030426 1075,10 done, 1099.3425967693329 1075,15 done, 1099.6270215511322 1080,0 done, 1104.0077860355377 1080,5 done, 1104.2915165424347 1080,10 done, 1104.5860631465912 1080,15 done, 1104.8722350597382 1085,0 done, 1109.236074924469 1085,5 done, 1109.5148572921753 1085,10 done, 1109.7991902828217 1085,15 done, 1110.085428237915 1090,0 done, 1114.4971516132355 1090,5 done, 1114.7775840759277 1090,10 done, 1115.0680482387543 1090,15 done, 1115.3564898967743 1095,0 done, 1119.733363866806 1095,5 done, 1120.0189459323883 1095,10 done, 1120.3063745498657 1095,15 done, 1120.5882532596588 1100,0 done, 1124.9957702159882 1100,5 done, 1125.2812929153442 1100,10 done, 1125.5768175125122 1100,15 done, 1125.858867406845 1105,0 done, 1130.268595457077 1105,5 done, 1130.5482759475708 1105,10 done, 1130.823340177536 1105,15 done, 1131.0985162258148 1110,0 done, 1135.3941068649292 1110,5 done, 1135.6760725975037 1110,10 done, 1135.9476432800293 1110,15 done, 1136.2346460819244 1115,0 done, 1140.5023293495178 1115,5 done, 1140.7894322872162 1115,10 done, 1141.0632362365723 1115,15 done, 1141.3477919101715 1120,0 done, 1145.5996506214142 1120,5 done, 1145.8808016777039 1120,10 done, 1146.162737607956 1120,15 done, 1146.4395127296448 1125,0 done, 1150.7161781787872 1125,5 done, 1150.9943566322327 1125,10 done, 1151.2688057422638 1125,15 done, 1151.5495381355286 1130,0 done, 1155.8192601203918 1130,5 done, 1156.0978605747223 1130,10 done, 1156.373423576355 1130,15 done, 1156.6533255577087 1135,0 done, 1160.9056115150452 1135,5 done, 1161.187650680542 1135,10 done, 1161.4656240940094 1135,15 done, 1161.7416336536407 1140,0 done, 1166.0166244506836 1140,5 done, 1166.292489528656 1140,10 done, 1166.570408821106 1140,15 done, 1166.851573228836 1145,0 done, 1171.0578076839447 1145,5 done, 1171.331726551056 1145,10 done, 1171.612963438034 1145,15 done, 1171.8917789459229 1150,0 done, 1176.1561117172241 1150,5 done, 1176.4364132881165 1150,10 done, 1176.7107977867126 1150,15 done, 1176.987321138382 1155,0 done, 1181.2489268779755 1155,5 done, 1181.520659685135 1155,10 done, 1181.803949356079 1155,15 done, 1182.0831470489502 1160,0 done, 1186.347627878189 1160,5 done, 1186.6355175971985 1160,10 done, 1186.9172685146332 1160,15 done, 1187.1981542110443 1165,0 done, 1191.4484939575195 1165,5 done, 1191.724142074585 1165,10 done, 1191.9999377727509 1165,15 done, 1192.2816755771637 1170,0 done, 1196.5937259197235 1170,5 done, 1196.867217540741 1170,10 done, 1197.1487972736359 1170,15 done, 1197.4347579479218 1175,0 done, 1201.6635837554932 1175,5 done, 1201.945675611496 1175,10 done, 1202.224086523056 1175,15 done, 1202.501743555069 1180,0 done, 1206.7694919109344 1180,5 done, 1207.0450048446655 1180,10 done, 1207.325566291809 1180,15 done, 1207.5961349010468 1185,0 done, 1211.8595299720764 1185,5 done, 1212.1404039859772 1185,10 done, 1212.422812461853 1185,15 done, 1212.6959252357483 1190,0 done, 1216.9548873901367 1190,5 done, 1217.2315192222595 1190,10 done, 1217.5136504173279 1190,15 done, 1217.7897448539734 1195,0 done, 1222.0654270648956 1195,5 done, 1222.3316173553467 1195,10 done, 1222.6171822547913 1195,15 done, 1222.8862719535828 1200,0 done, 1227.198169708252 1200,5 done, 1227.472333908081 1200,10 done, 1227.7556042671204 1200,15 done, 1228.038906097412 1205,0 done, 1232.2589263916016 1205,5 done, 1232.5338411331177 1205,10 done, 1232.803099155426 1205,15 done, 1233.0916182994843 1210,0 done, 1237.3449256420135 1210,5 done, 1237.6192302703857 1210,10 done, 1237.897938966751 1210,15 done, 1238.1805198192596 1215,0 done, 1242.404221534729 1215,5 done, 1242.6863224506378 1215,10 done, 1242.9630308151245 1215,15 done, 1243.2446374893188 1220,0 done, 1247.5561499595642 1220,5 done, 1247.8288509845734 1220,10 done, 1248.115648984909 1220,15 done, 1248.3973772525787 1225,0 done, 1252.672382593155 1225,5 done, 1252.9438087940216 1225,10 done, 1253.2243266105652 1225,15 done, 1253.5034592151642 1230,0 done, 1257.8050847053528 1230,5 done, 1258.0927331447601 1230,10 done, 1258.3730342388153 1230,15 done, 1258.652156829834 1235,0 done, 1262.9399600028992 1235,5 done, 1263.210993051529 1235,10 done, 1263.4909036159515 1235,15 done, 1263.7753131389618 1240,0 done, 1268.0354998111725 1240,5 done, 1268.3196654319763 1240,10 done, 1268.5911135673523 1240,15 done, 1268.8780944347382 1245,0 done, 1273.143345117569 1245,5 done, 1273.4263741970062 1245,10 done, 1273.7052874565125 1245,15 done, 1273.977172613144 1250,0 done, 1278.2499623298645 1250,5 done, 1278.5354166030884 1250,10 done, 1278.8043982982635 1250,15 done, 1279.092277765274 1255,0 done, 1283.5106468200684 1255,5 done, 1283.781147480011 1255,10 done, 1284.0767843723297 1255,15 done, 1284.3738152980804 1260,0 done, 1288.7941825389862 1260,5 done, 1289.0830874443054 1260,10 done, 1289.3742253780365 1260,15 done, 1289.6644303798676 1265,0 done, 1294.0469517707825 1265,5 done, 1294.332862854004 1265,10 done, 1294.6230688095093 1265,15 done, 1294.903118610382 1270,0 done, 1299.3652052879333 1270,5 done, 1299.66392993927 1270,10 done, 1299.950447320938 1270,15 done, 1300.246829509735 1275,0 done, 1304.6893606185913 1275,5 done, 1304.9760165214539 1275,10 done, 1305.2732141017914 1275,15 done, 1305.56423163414 1280,0 done, 1309.9902613162994 1280,5 done, 1310.2789947986603 1280,10 done, 1310.5654938220978 1280,15 done, 1310.8554456233978 1285,0 done, 1315.3180947303772 1285,5 done, 1315.602243900299 1285,10 done, 1315.8937718868256 1285,15 done, 1316.190182209015 1290,0 done, 1320.6255223751068 1290,5 done, 1320.9153006076813 1290,10 done, 1321.1891181468964 1290,15 done, 1321.4846577644348 1295,0 done, 1325.9444034099579 1295,5 done, 1326.221796989441 1295,10 done, 1326.5213205814362 1295,15 done, 1326.8059651851654 1300,0 done, 1331.289895772934 1300,5 done, 1331.5779581069946 1300,10 done, 1331.8661901950836 1300,15 done, 1332.1664481163025 1305,0 done, 1336.6263513565063 1305,5 done, 1336.9078183174133 1305,10 done, 1337.1978058815002 1305,15 done, 1337.4912497997284 1310,0 done, 1341.9573850631714 1310,5 done, 1342.240403175354 1310,10 done, 1342.5242319107056 1310,15 done, 1342.8203570842743 1315,0 done, 1347.2777073383331 1315,5 done, 1347.5598759651184 1315,10 done, 1347.8512926101685 1315,15 done, 1348.1488103866577 1320,0 done, 1352.6020736694336 1320,5 done, 1352.8901252746582 1320,10 done, 1353.1793999671936 1320,15 done, 1353.47087931633 1325,0 done, 1357.870010137558 1325,5 done, 1358.1560714244843 1325,10 done, 1358.436113834381 1325,15 done, 1358.726315498352 1330,0 done, 1363.2116422653198 1330,5 done, 1363.4920978546143 1330,10 done, 1363.776396036148 1330,15 done, 1364.0685241222382 1335,0 done, 1368.555593252182 1335,5 done, 1368.8432149887085 1335,10 done, 1369.1355984210968 1335,15 done, 1369.419828414917 1340,0 done, 1373.880407333374 1340,5 done, 1374.1606640815735 1340,10 done, 1374.4566433429718 1340,15 done, 1374.7302160263062 1345,0 done, 1379.1529858112335 1345,5 done, 1379.4387147426605 1345,10 done, 1379.7273545265198 1345,15 done, 1380.005935907364 1350,0 done, 1384.456226825714 1350,5 done, 1384.7418467998505 1350,10 done, 1385.0337941646576 1350,15 done, 1385.320437669754 1355,0 done, 1389.7390677928925 1355,5 done, 1390.0194087028503 1355,10 done, 1390.3147356510162 1355,15 done, 1390.5938243865967 1360,0 done, 1395.0651240348816 1360,5 done, 1395.3431298732758 1360,10 done, 1395.6388103961945 1360,15 done, 1395.9253718852997 1365,0 done, 1400.3528988361359 1365,5 done, 1400.6398475170135 1365,10 done, 1400.9173476696014 1365,15 done, 1401.2045514583588 1370,0 done, 1405.6582164764404 1370,5 done, 1405.944248199463 1370,10 done, 1406.2306587696075 1370,15 done, 1406.5185101032257 1375,0 done, 1410.9868590831757 1375,5 done, 1411.2783670425415 1375,10 done, 1411.5689027309418 1375,15 done, 1411.8499228954315 1380,0 done, 1416.280156135559 1380,5 done, 1416.5612716674805 1380,10 done, 1416.8538286685944 1380,15 done, 1417.133064031601 1385,0 done, 1421.5894074440002 1385,5 done, 1421.8790233135223 1385,10 done, 1422.1646699905396 1385,15 done, 1422.4518551826477 1390,0 done, 1426.8892378807068 1390,5 done, 1427.1749420166016 1390,10 done, 1427.4678270816803 1390,15 done, 1427.7568488121033 1395,0 done, 1432.2238388061523 1395,5 done, 1432.5165808200836 1395,10 done, 1432.7975931167603 1395,15 done, 1433.0853350162506 1400,0 done, 1437.5340840816498 1400,5 done, 1437.8241851329803 1400,10 done, 1438.105785369873 1400,15 done, 1438.3948693275452 1405,0 done, 1442.8307745456696 1405,5 done, 1443.1074397563934 1405,10 done, 1443.3991956710815 1405,15 done, 1443.6927409172058 1410,0 done, 1448.1311299800873 1410,5 done, 1448.4112389087677 1410,10 done, 1448.7007024288177 1410,15 done, 1448.9871485233307 1415,0 done, 1453.4504115581512 1415,5 done, 1453.739330291748 1415,10 done, 1454.019003868103 1415,15 done, 1454.2978746891022 1420,0 done, 1458.7969460487366 1420,5 done, 1459.0788702964783 1420,10 done, 1459.3702671527863 1420,15 done, 1459.660877943039 1425,0 done, 1464.1276466846466 1425,5 done, 1464.4108002185822 1425,10 done, 1464.7030584812164 1425,15 done, 1464.9952533245087 1430,0 done, 1469.44908452034 1430,5 done, 1469.7254812717438 1430,10 done, 1470.019365310669 1430,15 done, 1470.3115231990814 1435,0 done, 1474.7442061901093 1435,5 done, 1475.0177021026611 1435,10 done, 1475.31063246727 1435,15 done, 1475.604742527008 1440,0 done, 1479.9793946743011 1440,5 done, 1480.2744054794312 1440,10 done, 1480.5588250160217 1440,15 done, 1480.848289012909 1445,0 done, 1485.3021142482758 1445,5 done, 1485.5885136127472 1445,10 done, 1485.8802309036255 1445,15 done, 1486.1670134067535 1450,0 done, 1490.6566007137299 1450,5 done, 1490.9395499229431 1450,10 done, 1491.2314400672913 1450,15 done, 1491.5234537124634 1455,0 done, 1495.9523499011993 1455,5 done, 1496.2443075180054 1455,10 done, 1496.5368492603302 1455,15 done, 1496.8190381526947 1460,0 done, 1501.1923081874847 1460,5 done, 1501.4696683883667 1460,10 done, 1501.754490852356 1460,15 done, 1502.0424206256866 1465,0 done, 1506.50576877594 1465,5 done, 1506.7909507751465 1465,10 done, 1507.0778632164001 1465,15 done, 1507.3689999580383 1470,0 done, 1511.842297077179 1470,5 done, 1512.1233472824097 1470,10 done, 1512.416803598404 1470,15 done, 1512.710957288742 1475,0 done, 1517.1613419055939 1475,5 done, 1517.4547715187073 1475,10 done, 1517.7403650283813 1475,15 done, 1518.0270719528198 1480,0 done, 1522.4738779067993 1480,5 done, 1522.766577720642 1480,10 done, 1523.0550026893616 1480,15 done, 1523.3425183296204 1485,0 done, 1527.8103561401367 1485,5 done, 1528.1033325195312 1485,10 done, 1528.3875641822815 1485,15 done, 1528.6712081432343 1490,0 done, 1533.0466799736023 1490,5 done, 1533.3303785324097 1490,10 done, 1533.6261901855469 1490,15 done, 1533.9132680892944 1495,0 done, 1538.3994448184967 1495,5 done, 1538.6864848136902 1495,10 done, 1538.9653491973877 1495,15 done, 1539.260353088379 1500,0 done, 1543.638282775879 1500,5 done, 1543.9250354766846 1500,10 done, 1544.214593410492 1500,15 done, 1544.5050191879272 1505,0 done, 1548.8984842300415 1505,5 done, 1549.1821494102478 1505,10 done, 1549.466402053833 1505,15 done, 1549.752501964569 1510,0 done, 1554.2031362056732 1510,5 done, 1554.493281841278 1510,10 done, 1554.7908072471619 1510,15 done, 1555.0753591060638 1515,0 done, 1559.5585503578186 1515,5 done, 1559.8379545211792 1515,10 done, 1560.1321606636047 1515,15 done, 1560.420693397522 1520,0 done, 1564.879062652588 1520,5 done, 1565.1623225212097 1520,10 done, 1565.451782464981 1520,15 done, 1565.7412202358246 1525,0 done, 1570.206297159195 1525,5 done, 1570.4942679405212 1525,10 done, 1570.7786943912506 1525,15 done, 1571.070872783661 1530,0 done, 1575.5144588947296 1530,5 done, 1575.8031005859375 1530,10 done, 1576.0884537696838 1530,15 done, 1576.3824422359467 1535,0 done, 1580.8448729515076 1535,5 done, 1581.128226518631 1535,10 done, 1581.4175453186035 1535,15 done, 1581.7058882713318 1540,0 done, 1586.164858341217 1540,5 done, 1586.4474453926086 1540,10 done, 1586.7444491386414 1540,15 done, 1587.0311632156372 1545,0 done, 1591.4871175289154 1545,5 done, 1591.7666375637054 1545,10 done, 1592.0594322681427 1545,15 done, 1592.3514547348022 1550,0 done, 1596.838164806366 1550,5 done, 1597.1234109401703 1550,10 done, 1597.4022674560547 1550,15 done, 1597.6992835998535 1555,0 done, 1602.1487746238708 1555,5 done, 1602.4302070140839 1555,10 done, 1602.7262332439423 1555,15 done, 1602.999845981598 1560,0 done, 1607.459373474121 1560,5 done, 1607.7379531860352 1560,10 done, 1608.0324068069458 1560,15 done, 1608.32715177536 1565,0 done, 1612.793131351471 1565,5 done, 1613.0726613998413 1565,10 done, 1613.3642733097076 1565,15 done, 1613.6548962593079 1570,0 done, 1618.106910943985 1570,5 done, 1618.3916292190552 1570,10 done, 1618.6894607543945 1570,15 done, 1618.974604845047 1575,0 done, 1623.3689367771149 1575,5 done, 1623.6664776802063 1575,10 done, 1623.9582056999207 1575,15 done, 1624.25022315979 1580,0 done, 1628.5396060943604 1580,5 done, 1628.8150129318237 1580,10 done, 1629.095269203186 1580,15 done, 1629.377382516861 1585,0 done, 1633.658089876175 1585,5 done, 1633.9359622001648 1585,10 done, 1634.2099792957306 1585,15 done, 1634.4954874515533 1590,0 done, 1638.7988607883453 1590,5 done, 1639.0726022720337 1590,10 done, 1639.3576691150665 1590,15 done, 1639.6377985477448 1595,0 done, 1643.923364162445 1595,5 done, 1644.1949224472046 1595,10 done, 1644.4807980060577 1595,15 done, 1644.7643427848816 1600,0 done, 1648.9949202537537 1600,5 done, 1649.2699205875397 1600,10 done, 1649.5517718791962 1600,15 done, 1649.8374943733215 1605,0 done, 1654.0992937088013 1605,5 done, 1654.38671541214 1605,10 done, 1654.6593968868256 1605,15 done, 1654.9437880516052 1610,0 done, 1659.1948356628418 1610,5 done, 1659.4742982387543 1610,10 done, 1659.755402803421 1610,15 done, 1660.0431714057922 1615,0 done, 1664.343198299408 1615,5 done, 1664.6266205310822 1615,10 done, 1664.9086818695068 1615,15 done, 1665.1851456165314 1620,0 done, 1669.4755675792694 1620,5 done, 1669.754335641861 1620,10 done, 1670.0300176143646 1620,15 done, 1670.3133940696716 1625,0 done, 1674.6247735023499 1625,5 done, 1674.9123027324677 1625,10 done, 1675.1876277923584 1625,15 done, 1675.4713144302368 1630,0 done, 1679.7762053012848 1630,5 done, 1680.0508410930634 1630,10 done, 1680.3339908123016 1630,15 done, 1680.6247470378876 1635,0 done, 1684.9116768836975 1635,5 done, 1685.1884744167328 1635,10 done, 1685.47207736969 1635,15 done, 1685.758861064911 1640,0 done, 1690.0661549568176 1640,5 done, 1690.3442685604095 1640,10 done, 1690.6331582069397 1640,15 done, 1690.918221950531 1645,0 done, 1695.2315516471863 1645,5 done, 1695.5156552791595 1645,10 done, 1695.793860912323 1645,15 done, 1696.0813529491425 1650,0 done, 1700.4476068019867 1650,5 done, 1700.7275772094727 1650,10 done, 1701.0119335651398 1650,15 done, 1701.28173828125 1655,0 done, 1705.5868134498596 1655,5 done, 1705.8665626049042 1655,10 done, 1706.1491515636444 1655,15 done, 1706.4296061992645 1660,0 done, 1710.7366631031036 1660,5 done, 1711.0222730636597 1660,10 done, 1711.2992975711823 1660,15 done, 1711.585297346115 1665,0 done, 1715.8622040748596 1665,5 done, 1716.135176897049 1665,10 done, 1716.4167003631592 1665,15 done, 1716.7077248096466 1670,0 done, 1720.982511997223 1670,5 done, 1721.2617268562317 1670,10 done, 1721.5397174358368 1670,15 done, 1721.8112037181854 1675,0 done, 1726.082765340805 1675,5 done, 1726.3536608219147 1675,10 done, 1726.6240792274475 1675,15 done, 1726.899139881134 1680,0 done, 1731.1563973426819 1680,5 done, 1731.4409003257751 1680,10 done, 1731.731808423996 1680,15 done, 1732.0064458847046 1685,0 done, 1736.2261145114899 1685,5 done, 1736.497272014618 1685,10 done, 1736.7817151546478 1685,15 done, 1737.053953409195 1690,0 done, 1741.365031003952 1690,5 done, 1741.6398768424988 1690,10 done, 1741.923192024231 1690,15 done, 1742.2077927589417 1695,0 done, 1746.493324995041 1695,5 done, 1746.7710993289948 1695,10 done, 1747.0352575778961 1695,15 done, 1747.3261651992798 1700,0 done, 1751.7026813030243 1700,5 done, 1751.9838466644287 1700,10 done, 1752.266129732132 1700,15 done, 1752.5413556098938 1705,0 done, 1756.8853437900543 1705,5 done, 1757.1559665203094 1705,10 done, 1757.4530477523804 1705,15 done, 1757.7383739948273 1710,0 done, 1762.2250459194183 1710,5 done, 1762.5164391994476 1710,10 done, 1762.7980518341064 1710,15 done, 1763.0865623950958 1715,0 done, 1767.5114269256592 1715,5 done, 1767.7921414375305 1715,10 done, 1768.0839972496033 1715,15 done, 1768.3746898174286 1720,0 done, 1772.7811403274536 1720,5 done, 1773.0634455680847 1720,10 done, 1773.3448324203491 1720,15 done, 1773.6367201805115 1725,0 done, 1778.1351535320282 1725,5 done, 1778.4337902069092 1725,10 done, 1778.721649646759 1725,15 done, 1779.0179092884064 1730,0 done, 1783.5315346717834 1730,5 done, 1783.8199300765991 1730,10 done, 1784.1041901111603 1730,15 done, 1784.3871011734009 1735,0 done, 1788.7118699550629 1735,5 done, 1788.9948437213898 1735,10 done, 1789.2687556743622 1735,15 done, 1789.56090259552 1740,0 done, 1794.0425453186035 1740,5 done, 1794.3427095413208 1740,10 done, 1794.6251349449158 1740,15 done, 1794.9177000522614 1745,0 done, 1799.3805117607117 1745,5 done, 1799.6638362407684 1745,10 done, 1799.9652247428894 1745,15 done, 1800.248604774475 1750,0 done, 1804.736929178238 1750,5 done, 1805.0271327495575 1750,10 done, 1805.313205242157 1750,15 done, 1805.6027328968048 1755,0 done, 1810.0428881645203 1755,5 done, 1810.3235921859741 1755,10 done, 1810.6056263446808 1755,15 done, 1810.8874933719635 1760,0 done, 1815.28693151474 1760,5 done, 1815.5863530635834 1760,10 done, 1815.8716411590576 1760,15 done, 1816.1379835605621 1765,0 done, 1820.498405456543 1765,5 done, 1820.7805151939392 1765,10 done, 1821.0755670070648 1765,15 done, 1821.3610372543335 1770,0 done, 1825.6982960700989 1770,5 done, 1825.979425907135 1770,10 done, 1826.249035835266 1770,15 done, 1826.528493642807 1775,0 done, 1830.8511908054352 1775,5 done, 1831.125937461853 1775,10 done, 1831.4026646614075 1775,15 done, 1831.6832506656647 1780,0 done, 1836.009681224823 1780,5 done, 1836.2952706813812 1780,10 done, 1836.5763463974 1780,15 done, 1836.8522922992706 1785,0 done, 1841.1721234321594 1785,5 done, 1841.4476647377014 1785,10 done, 1841.7438611984253 1785,15 done, 1842.0183806419373 1790,0 done, 1846.3699111938477 1790,5 done, 1846.6484065055847 1790,10 done, 1846.935781955719 1790,15 done, 1847.2180392742157 1795,0 done, 1851.528379201889 1795,5 done, 1851.816967010498 1795,10 done, 1852.100044965744 1795,15 done, 1852.3791456222534 1800,0 done, 1856.7189676761627 1800,5 done, 1857.0005943775177 1800,10 done, 1857.2817344665527 1800,15 done, 1857.5671706199646 1805,0 done, 1861.9233078956604 1805,5 done, 1862.2011711597443 1805,10 done, 1862.4832408428192 1805,15 done, 1862.7702803611755 1810,0 done, 1867.0914704799652 1810,5 done, 1867.3716359138489 1810,10 done, 1867.6560761928558 1810,15 done, 1867.9405903816223 1815,0 done, 1872.3122537136078 1815,5 done, 1872.5958116054535 1815,10 done, 1872.881212234497 1815,15 done, 1873.1608266830444 1820,0 done, 1877.7236602306366 1820,5 done, 1877.9994888305664 1820,10 done, 1878.2660038471222 1820,15 done, 1878.555434703827 1825,0 done, 1882.8460788726807 1825,5 done, 1883.1310040950775 1825,10 done, 1883.4048540592194 1825,15 done, 1883.6852207183838
In [85]:
len(flattened)
Out[85]:
165478
In [86]:
tmp = pd.DataFrame(flattened,columns = ['new_id'])
tmp = pd.merge(tmp,df[['new_id','any_canonic_motif']],how='left',on='new_id')
motif_threshold,len(tmp),tmp['any_canonic_motif'].sum()/len(tmp)
Out[86]:
(0.72, 165478, 0.5836364954858048)
In [88]:
195574/165478
Out[88]:
1.1818731190853164
In [96]:
tmp.to_csv(tmp_dir+'ts.scoring.tsv',sep=str('\t'),header=True,index=None,quoting=csv.QUOTE_NONE)
In [97]:
len(tmp)
Out[97]:
165478
In [100]:
tmp = pd.merge(tmp,df[['new_id','t']],how='left',on='new_id')
In [105]:
tmp = pd.merge(tmp,df[['new_id','gene_id']],how='left',on='new_id')
In [107]:
tmp['t1'] = 1
tmp = pd.merge(tmp.drop('t1',1),tmp.groupby('gene_id').agg({'t1':sum}).reset_index(),how='inner',on='gene_id')
In [108]:
len(tmp.loc[tmp['t1']<4])/len(tmp)
Out[108]:
0.10560920484898295
In [111]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))
tmp['t1'] = 1
gr = tmp.groupby('gene_id').agg({'t1':sum}).reset_index()
ax = sns.histplot(gr['t1'],stat='density')
ax.set(xlabel = '# filtered PAS in the gene', title='points are GENES')
Out[111]:
[Text(0.5, 0, '# filtered PAS in the gene'), Text(0.5, 1.0, 'points are GENES')]
In [114]:
gr['t1'].quantile(0.5)
Out[114]:
4.0
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))
ax = sns.histplot(df['t'],stat='density')
ax.set(xlabel = '# PAS in the gene')
Final quantification and figure about comparison with v2 and DL-retrieved atlas¶
In [ ]:
In [ ]:
In [ ]:
In [50]:
In [34]:
expr_feature = feature+'_sum'
scoring_feature = feature+'_ratio'
data['expr_cat'] = pd.qcut(data[expr_feature],q=5)
for expr_cat in list(data['expr_cat'].unique()):
tmp = data.loc[data['expr_cat']==expr_cat].reset_index(drop=True)
tmp = tmp[['new_id',scoring_feature,'any_canonic_motif']].sort_values(scoring_feature,ascending=False).reset_index(drop=True)
tmp['t']=1
tmp['t_cumsum'] = tmp['t'].cumsum()
tmp['any_canonic_motif_cumul'] = tmp['any_canonic_motif'].cumsum()
tmp['frac_cumul'] = tmp['any_canonic_motif_cumul']/tmp['t_cumsum']
if tmp['frac_cumul'].max()>=motif_threshold:
max_index = max(tmp.loc[tmp['frac_cumul']>motif_threshold].index)
pas_to_append = list(tmp.loc[0:max_index]['new_id'])
In [32]:
Counter(data['any_canonic_motif'])
Out[32]:
Counter({0: 83380, 1: 16620})
In [ ]:
In [ ]:
def get_PAS_scores(L,sample,anchor_df,iterator):
tmp = PAQR_median_expression_matrix.loc[PAQR_median_expression_matrix[sample]!=-1][['Row.names','exon',sample]].dropna().reset_index(drop=True)
tmp[sample] = tmp[sample].astype('int')
tmp = pd.merge(tmp,tmp.groupby('exon').agg({sample:sum}).reset_index().rename(columns={sample:'exon_sum'}),how='inner',on='exon')
tmp = tmp.loc[tmp['exon_sum']>0].reset_index(drop=True)
tmp['score'] = tmp.apply(lambda x:np.log10(stats.binomtest(x[sample], x['exon_sum'], p=0.5, alternative='greater').pvalue+10**(-300))*(-1)-np.log10(stats.binomtest(x[sample], x['exon_sum'], p=0.5, alternative='less').pvalue+10**(-300))*(-1),1)
tmp = tmp[['Row.names','score']].rename(columns={'score':sample})
tmp = pd.merge(anchor_df,tmp,how='left',on='Row.names')[[sample]]
L.append(tmp)
if iterator%100==0:
print(str(iterator)+' done, '+str(time.time()-start_time))
start_time = time.time()
anchor_df = PAQR_median_expression_matrix[['Row.names']]
with Manager() as manager:
L = manager.list()
processes = []
i=0
for sample in list_of_samples:
p = Process(target=get_PAS_scores, args=(L,sample,anchor_df,i)) # Passing the list
p.start()
processes.append(p)
i=i+1
for p in processes:
p.join()
L = list(L)
res = pd.concat([anchor_df[['Row.names']]]+L,axis=1)
print(time.time()-start_time)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [142]:
feature = 'score'
y_feature = feature+'_ratio'
x_feature = feature+'_sum'
x_feature_log = x_feature+'_log'
data = df.loc[df['t']==2][['new_id','gene_id','any_canonic_motif','t']+[feature,y_feature,x_feature]].copy()
data[x_feature_log] = np.log2(data[x_feature]+10**(-30))
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=False, sharex=True,figsize=(3,3))
ax = sns.scatterplot(data = data.sample(1000),x=x_feature_log,y=y_feature,s=3)
In [143]:
data.head()
Out[143]:
| new_id | gene_id | any_canonic_motif | t | score | score_ratio | score_sum | score_sum_log | |
|---|---|---|---|---|---|---|---|---|
| 16 | 290 | ENSG00000279928.2 | 1 | 2 | 0.116169 | 0.308413 | 0.376666 | -1.408642 |
| 17 | 291 | ENSG00000279928.2 | 0 | 2 | 0.260497 | 0.691587 | 0.376666 | -1.408642 |
| 33 | 855 | ENSG00000268663.1 | 0 | 2 | 0.124622 | 0.342533 | 0.363825 | -1.458682 |
| 34 | 858 | ENSG00000268663.1 | 0 | 2 | 0.239203 | 0.657467 | 0.363825 | -1.458682 |
| 11279 | 27289 | ENSG00000260972.1 | 1 | 2 | 2.564076 | 0.438579 | 5.846327 | 2.547530 |
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [6]:
In [7]:
merged_pas_motif_table = pd.merge(SCINPAS_full[['seqid','start','end','id','score']+tissues].rename(columns={'score':'score_1'}),
merged_pas_motif_table,how='inner',on=['id'])
In [8]:
merged_pas_motif_table = merged_pas_motif_table.drop('score',1).rename(columns={'score_1':'score'}) # score_1 from SCINPAS
In [9]:
len(merged_pas_motif_table)
Out[9]:
18432135
In [10]:
cols = list(merged_pas_motif_table.columns)
motifs = cols[-15:-4]
In [11]:
merged_pas_motif_table[motifs] = merged_pas_motif_table[motifs].astype('boolean')
merged_pas_motif_table['any_canonic_motif'] = (merged_pas_motif_table[motifs].sum(1)>0).astype('int')
In [12]:
Out[12]:
| seqid | start | end | id | score | strand | class | gene_id | gene_name | nose | ... | kidney | penis | ureter | lung | liver | skin | prostate | uterus | bloodImmune | brain | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | chr1 | 9999 | 10000 | chr1:10000:-:9999:10000:0.5596258722876915:3 | 0.559626 | - | true_intergenic | NaN | NaN | 0.0 | ... | 0.0 | 0.000000 | 0.006918 | 0.0 | 0.000000 | 0.010019 | 0.0 | 0.0 | 0.000000 | 0.000000 |
| 1 | chr1 | 10464 | 10465 | chr1:10465:+:10464:10465:0.401783597747119:1 | 0.401784 | + | true_intergenic | NaN | NaN | 0.0 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.004415 |
| 2 | chr1 | 11533 | 11534 | chr1:11534:-:11533:11534:0.2873875380716641:1 | 0.287388 | - | true_intergenic | NaN | NaN | 0.0 | ... | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.001145 | 0.000000 |
| 3 | chr1 | 14350 | 14370 | chr1:14355:-:14350:14370:52.50382345788897:82 | 52.503823 | - | antisense_TE | NaN | NaN | 0.0 | ... | 0.0 | 0.085025 | 0.033230 | 0.0 | 0.081067 | 0.128423 | 0.0 | 0.0 | 0.139736 | 0.000000 |
| 4 | chr1 | 14395 | 14425 | chr1:14403:-:14395:14425:283.585503076957:117 | 283.585503 | - | TE | ENSG00000227232.5 | WASH7P | 0.0 | ... | 0.0 | 0.000000 | 0.015700 | 0.0 | 0.013593 | 0.000000 | 0.0 | 0.0 | 1.081058 | 0.034856 |
5 rows × 27 columns
In [ ]:
merged_pas_motif_table.head()
In [ ]:
In [ ]:
In [77]:
In [66]:
In [ ]:
In [35]:
merged_pas_motif_table['score_cat'] = pd.qcut(merged_pas_motif_table['score'],10)
In [46]:
merged_pas_motif_table['score'].max()
Out[46]:
21713.1321024367
In [54]:
a = []
for quantile in list(pd.Series(range(0,101,2))/100)[:-1]:
threshold = c['score'].quantile(quantile)
tmp = merged_pas_motif_table.loc[merged_pas_motif_table['score']>threshold]
n,m = len(tmp),tmp['any_canonic_motif'].sum()
a.append([quantile,threshold,n,m/n])
In [68]:
len(polyAsite)/10**6
Out[68]:
0.569005
In [65]:
np.round(len(merged_pas_motif_table)/len(polyAsite),1)
Out[65]:
2.2
In [55]:
pd.DataFrame(a)
Out[55]:
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | 0.00 | 0.023074 | 1228808 | 0.311388 |
| 1 | 0.02 | 0.023687 | 1204232 | 0.312293 |
| 2 | 0.04 | 0.024322 | 1179656 | 0.313075 |
| 3 | 0.06 | 0.024986 | 1155080 | 0.314088 |
| 4 | 0.08 | 0.025688 | 1130504 | 0.315049 |
| 5 | 0.10 | 0.026429 | 1105928 | 0.316039 |
| 6 | 0.12 | 0.027216 | 1081352 | 0.316867 |
| 7 | 0.14 | 0.028043 | 1056775 | 0.317816 |
| 8 | 0.16 | 0.028922 | 1032199 | 0.318822 |
| 9 | 0.18 | 0.029843 | 1007623 | 0.319764 |
| 10 | 0.20 | 0.030826 | 983047 | 0.320675 |
| 11 | 0.22 | 0.031868 | 958471 | 0.321649 |
| 12 | 0.24 | 0.032917 | 933895 | 0.323047 |
| 13 | 0.26 | 0.033967 | 909315 | 0.324519 |
| 14 | 0.28 | 0.035115 | 884742 | 0.325992 |
| 15 | 0.30 | 0.036380 | 860166 | 0.327324 |
| 16 | 0.32 | 0.037750 | 835590 | 0.328658 |
| 17 | 0.34 | 0.039132 | 811014 | 0.330296 |
| 18 | 0.36 | 0.040457 | 786438 | 0.332732 |
| 19 | 0.38 | 0.041920 | 761861 | 0.335018 |
| 20 | 0.40 | 0.043527 | 737285 | 0.337073 |
| 21 | 0.42 | 0.045256 | 712709 | 0.339137 |
| 22 | 0.44 | 0.047148 | 688133 | 0.341223 |
| 23 | 0.46 | 0.049229 | 663557 | 0.343313 |
| 24 | 0.48 | 0.051434 | 638981 | 0.345652 |
| 25 | 0.50 | 0.053772 | 614404 | 0.348209 |
| 26 | 0.52 | 0.056326 | 589828 | 0.350862 |
| 27 | 0.54 | 0.058981 | 565252 | 0.354000 |
| 28 | 0.56 | 0.061785 | 540676 | 0.357208 |
| 29 | 0.58 | 0.064720 | 516100 | 0.360610 |
| 30 | 0.60 | 0.068047 | 491524 | 0.364243 |
| 31 | 0.62 | 0.071745 | 466948 | 0.368268 |
| 32 | 0.64 | 0.075892 | 442371 | 0.372698 |
| 33 | 0.66 | 0.080524 | 417795 | 0.377281 |
| 34 | 0.68 | 0.085789 | 393219 | 0.382138 |
| 35 | 0.70 | 0.091739 | 368643 | 0.387535 |
| 36 | 0.72 | 0.098589 | 344067 | 0.393752 |
| 37 | 0.74 | 0.106429 | 319491 | 0.400111 |
| 38 | 0.76 | 0.115246 | 294914 | 0.407780 |
| 39 | 0.78 | 0.125782 | 270338 | 0.416157 |
| 40 | 0.80 | 0.138443 | 245762 | 0.425965 |
| 41 | 0.82 | 0.153796 | 221186 | 0.438332 |
| 42 | 0.84 | 0.173340 | 196610 | 0.452947 |
| 43 | 0.86 | 0.198312 | 172034 | 0.470698 |
| 44 | 0.88 | 0.231893 | 147457 | 0.493018 |
| 45 | 0.90 | 0.278884 | 122881 | 0.519934 |
| 46 | 0.92 | 0.352077 | 98305 | 0.557571 |
| 47 | 0.94 | 0.482353 | 73729 | 0.612635 |
| 48 | 0.96 | 0.783611 | 49153 | 0.697658 |
| 49 | 0.98 | 2.161783 | 24577 | 0.829637 |
In [ ]:
In [31]:
Counter(merged_pas_motif_table['any_canonic_motif'])
Out[31]:
Counter({1: 382636, 0: 846173})
In [59]:
382636/(382636+846173)
Out[59]:
0.31138769328675164
In [ ]:
In [17]:
In [ ]:
In [ ]:
In [4]:
merged_pas_motif_table.head()
Out[4]:
| score | class | AAUAAA | AUUAAA | UAUAAA | AGUAAA | AAUACA | AAUAUA | CAUAAA | GAUAAA | ACUAAA | AAUAGA | phastcon | entropy | num_cs | width | any_canonic_motif | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.559626 | true_intergenic | False | False | False | False | False | False | False | False | False | False | 0.000000 | 0.000000 | 1 | 1 | 0 |
| 1 | 0.287388 | true_intergenic | False | True | False | False | False | False | False | False | False | False | 0.010700 | 0.000000 | 1 | 1 | 1 |
| 2 | 52.503823 | antisense_TE | True | False | False | False | False | False | False | False | False | False | 0.003725 | 0.618032 | 17 | 20 | 1 |
| 3 | 283.585503 | TE | True | False | False | False | False | False | False | False | False | False | 0.001900 | 0.391705 | 21 | 30 | 1 |
| 4 | 0.153752 | TE | False | False | False | False | False | False | False | False | False | False | 0.060475 | 0.000000 | 1 | 1 | 0 |
In [ ]:
In [ ]:
In [ ]:
def gini(x):
return 1-np.sum([elem**2 for elem in x])
In [1]:
merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [4,6,7,8,9,11,12,13,14,16,17,18,19,20,21,22])
merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category')
cols = list(merged_pas_motif_table.columns)
merged_pas_motif_table[cols[2:-4]] = merged_pas_motif_table[cols[2:-4]].astype('boolean')
merged_pas_motif_table['any_canonic_motif'] = merged_pas_motif_table[cols[2:-4]].sum(1)
merged_pas_motif_table['any_canonic_motif'] = merged_pas_motif_table['any_canonic_motif'].astype('int')
merged_pas_motif_table['phastcon_quant'] = pd.qcut(merged_pas_motif_table['phastcon'],q=15,labels = list('q'+pd.Series(range(1,16)).astype('str')))
def gini(x):
return 1-np.sum([elem**2 for elem in x])
df = pd.DataFrame(merged_pas_motif_table['num_cs'].unique(),columns=['num_cs'])
df['max_gini'] = df.apply(lambda x:gini([1/x['num_cs']]*x['num_cs']),1)
merged_pas_motif_table = pd.merge(merged_pas_motif_table,df,how='left',on='num_cs')
merged_pas_motif_table = merged_pas_motif_table.rename(columns = {'entropy':'gini'})
merged_pas_motif_table['max_gini'] = merged_pas_motif_table['max_gini']+10**(-5)
merged_pas_motif_table['normal_gini'] = merged_pas_motif_table['gini']/merged_pas_motif_table['max_gini']
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In[1], line 1 ----> 1 merged_pas_motif_table = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis_2/result/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",index_col=None,header=0,usecols = [4,6,7,8,9,11,12,13,14,16,17,18,19,20,21,22]) 2 merged_pas_motif_table['class'] = merged_pas_motif_table['class'].astype('category') 3 cols = list(merged_pas_motif_table.columns) NameError: name 'pd' is not defined
In [3]:
merged_pas_motif_table['num_cs'] = merged_pas_motif_table['num_cs'].astype('category')
merged_pas_motif_table['width'] = merged_pas_motif_table['width'].astype('category')
In [4]:
merged_pas_motif_table['RPM_log2'] = np.log2(merged_pas_motif_table['score'])
In [89]:
merged_pas_motif_table['RPM_quant'] = pd.qcut(merged_pas_motif_table['RPM_log2'],q=100,labels = list('q'+pd.Series(range(1,101)).astype('str')))
In [33]:
merged_pas_motif_table['normal_gini_bin'] = pd.cut(merged_pas_motif_table['normal_gini'],bins=20)
In [7]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))
data = merged_pas_motif_table.sample(200000)
data['t']=1
gr = data.groupby('num_cs').agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
x_feature = 'num_cs'
ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='# of cleavage sites in PAS')
Out[7]:
[Text(0.5, 0, '# of cleavage sites in PAS')]
In [113]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))
x_feature = 'width'
data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==2].sample(200000)
data['t']=1
gr = data.groupby(x_feature).agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='',title='PAS having 2 cleavage sites')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='PAS width, nt')
Out[113]:
[Text(0.5, 0, 'PAS width, nt')]
In [115]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))
x_feature = 'width'
data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==3].sample(200000)
data['t']=1
gr = data.groupby(x_feature).agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%',color='blue')
ax.set(xlabel='',title='PAS having 2 cleavage sites')
ax = sns.pointplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',estimator=np.median,color='blue')
ax.set(xlabel='')
ax = sns.pointplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',estimator=np.median,color='blue')
ax.set(xlabel='PAS width, nt')
Out[115]:
[Text(0.5, 0, 'PAS width, nt')]
In [8]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(3,1, sharey=False, sharex=True,figsize=(15,5))
data = merged_pas_motif_table.sample(200000)
data['t']=1
gr = data.groupby('num_cs').agg({'t':np.sum}).reset_index()
gr['%'] = np.round(gr['t']/gr['t'].sum()*100,2)
x_feature = 'num_cs'
ax = sns.barplot(ax = axes[0],data = gr,x=x_feature,y='%')
ax.set(xlabel='')
ax = sns.boxplot(ax = axes[1],data = data, x=x_feature,y='RPM_log2',showfliers=False)
ax.set(xlabel='')
ax = sns.boxplot(ax = axes[2],data = data, x=x_feature,y='normal_gini',showfliers=False)
ax.set(xlabel='# of cleavage sites in PAS')
Out[8]:
[Text(0.5, 0, '# of cleavage sites in PAS')]
In [100]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,3, sharey=True, sharex=True,figsize=(10,3))
i = 0
for num_cs in [2,3,4]:
data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs']==num_cs].sample(100000)
data['RPM_quant'] = pd.qcut(data['RPM_log2'],q=15,labels = list('q'+pd.Series(range(1,16)).astype('str')))
ax = sns.histplot(ax=axes[i], data = data,x='RPM_log2',y='normal_gini',stat='density')
ax.set(title = str(num_cs)+' cleavage sites')
if i>0:
ax.set(ylabel='')
i=i+1
fig.tight_layout(pad=0.5)
In [94]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(10,3))
data = merged_pas_motif_table.loc[merged_pas_motif_table['num_cs'].isin([2,3,4,5])]
data['RPM_quant'] = pd.qcut(data['RPM_log2'],q=10,labels = list('q'+pd.Series(range(1,11)).astype('str')))
ax = sns.boxplot(data = data,x='RPM_quant',y='normal_gini',hue='num_cs',showfliers=False,saturation=1)
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='# of cleavage\nsites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(xlabel = 'total RPM quantile, in a given # of cleavage sites')
Out[94]:
[Text(0.5, 0, 'total RPM quantile, in a given # of cleavage sites')]
In [119]:
len(merged_pas_motif_table)*0.06*0.2
Out[119]:
221185.62
In [9]:
merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].astype('str')=='q15']['score'].min()
Out[9]:
18.75919782185748
In [123]:
18/900
Out[123]:
0.02
In [86]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,2, sharey=False, sharex=False,figsize=(8,5))
data = merged_pas_motif_table.sample(20000)
x_feature = 'RPM_log2'
ax = sns.histplot(ax=axes[0],data = data, x = x_feature,stat='density',hue='RPM_quant',alpha=1,legend=False)
x_feature = 'phastcon'
ax = sns.histplot(ax=axes[1],data = data, x = x_feature,stat='density',hue='phastcon_quant',alpha=1)
In [102]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,6, sharey=False, sharex=False,figsize=(15,3))
data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['TE'])].sample(20000)
ax = sns.boxplot(ax=axes[0],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within TE')
ax = sns.boxplot(ax=axes[1],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within TE')
data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['intronic'])].sample(20000)
ax = sns.boxplot(ax=axes[2],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intronic')
ax = sns.boxplot(ax=axes[3],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intronic')
data = merged_pas_motif_table.loc[merged_pas_motif_table['class'].isin(['true_intergenic'])].sample(20000)
ax = sns.boxplot(ax=axes[4],data = data, x = 'RPM_log2',y='phastcon_quant',showfliers=False,saturation=1)
xval = data.loc[data['phastcon_quant']=='q1']['RPM_log2'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intergenic')
ax = sns.boxplot(ax=axes[5],data = data, x = 'phastcon',y='RPM_quant',showfliers=False,saturation=1)
xval = data.loc[data['RPM_quant']=='q1']['phastcon'].median()
ax.vlines(x=xval,ymin=ax.get_ylim()[0],ymax=ax.get_ylim()[1],color='red',linestyles='--',linewidth=1)
ax.set(title = 'within intergenic')
fig.tight_layout(pad=0.5)
In [14]:
def entropy(x):
return (-1)*np.sum([elem*np.log2(elem) for elem in x])
def normalized_entropy(x):
ent = entropy(x)
max_ent = entropy([1/len(x)]*len(x))
return ent/max_ent
def gini(x):
return 1-np.sum([elem**2 for elem in x])
l = [0.0001,0.999,0.0003,0.0006]
entropy(l),normalized_entropy(l),gini(l)
Out[14]:
(0.012703219581699458, 0.006351609790849729, 0.001998539999999882)
In [43]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=False, sharex=False,figsize=(3,3))
x_feature = 'RPM_log2'
ax = sns.scatterplot(data = merged_pas_motif_table.sample(20000), x = 'RPM_log2',y='phastcon',s=5,alpha=0.5)
In [68]:
order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,len(order), sharey=True, sharex=True,figsize=(15,3))
x_feature = 'RPM_log2'
i=0
for class_ in order:
ax = sns.histplot(ax = axes[i],data = merged_pas_motif_table.loc[merged_pas_motif_table['class']==class_].sample(1000),x = x_feature,stat='density',alpha=1)
ax.set(title=class_,xlabel=x_feature)
i=i+1
In [91]:
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('class').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='class')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=False, sharex=True,figsize=(20,4))
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=5,fontsize=9,mode=None)
ax.set(ylabel = '% within class',xlabel='')
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('RPM_quant').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='RPM_quant')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
xmin,xmax = ax.get_xlim()[0],ax.get_xlim()[1]
ax.hlines(y=7,xmin=xmin,xmax=xmax,color='grey',linestyles='-',linewidth=0.7)
ax.hlines(y=3,xmin=xmin,xmax=xmax,color='grey',linestyles='-',linewidth=0.7)
ax.text(6.7,7,'7%',ha='right',va='bottom',size=8)
ax.text(6.7,3,'3%',ha='right',va='center',size=8)
ax.legend_.remove()
ax.set(ylabel = '% within\nexpression quantile')
Out[91]:
[Text(0, 0.5, '% within\nexpression quantile')]
In [120]:
merged_pas_motif_table['RPM_quant'] = pd.cut(merged_pas_motif_table['RPM_log2'],bins=100)
In [122]:
merged_pas_motif_table.head()
Out[122]:
| score | class | AAUAAA | AUUAAA | UAUAAA | AGUAAA | AAUACA | AAUAUA | CAUAAA | GAUAAA | ... | any_canonic_motif | phastcon_quant | max_gini | normal_gini | RPM_log2 | RPM_quant | normal_gini_bin | t | num_cs_bins | any_canonic_motif_pres | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.559626 | true_intergenic | False | False | False | False | False | False | False | False | ... | 0 | q1 | 0.000010 | 0.000000 | -0.837465 | (-1.005, -0.71] | (-0.001, 0.05] | 1 | (0, 1] | False |
| 1 | 0.287388 | true_intergenic | False | True | False | False | False | False | False | False | ... | 1 | q7 | 0.000010 | 0.000000 | -1.798931 | (-1.89, -1.595] | (-0.001, 0.05] | 1 | (0, 1] | True |
| 2 | 52.503823 | antisense_TE | True | False | False | False | False | False | False | False | ... | 1 | q4 | 0.941186 | 0.656652 | 5.714351 | (5.486, 5.781] | (0.65, 0.7] | 1 | (8, 51] | True |
| 3 | 283.585503 | TE | True | False | False | False | False | False | False | False | ... | 1 | q3 | 0.952391 | 0.411286 | 8.147640 | (8.141, 8.436] | (0.4, 0.45] | 1 | (8, 51] | True |
| 4 | 0.153752 | TE | False | False | False | False | False | False | False | False | ... | 0 | q11 | 0.000010 | 0.000000 | -2.701319 | (-2.776, -2.481] | (-0.001, 0.05] | 1 | (0, 1] | False |
5 rows × 26 columns
In [ ]:
In [126]:
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(7,1, sharey=False, sharex=True,figsize=(10,10))
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('RPM_quant').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='RPM_quant')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
i=0
for class_name in ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']:
ax = sns.pointplot(ax=axes[i],data = gr.loc[gr['class']==class_name],x='RPM_quant',y='%')
ax.set(xlabel='')
i=i+1
ax.set(ylabel = '% within\nexpression quantile')
Out[126]:
[Text(0, 0.5, '% within\nexpression quantile')]
In [ ]:
In [25]:
feature = 'phastcon_quant'
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby('class').agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on='class')
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=False, sharex=True,figsize=(13,4))
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within class',xlabel='')
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within\nphast cons quantile')
Out[25]:
[Text(0, 0.5, '% within\nphast cons quantile')]
In [26]:
merged_pas_motif_table['num_cs_bins'] = pd.cut(merged_pas_motif_table['num_cs'],bins = [0,1,2,3,4,5,6,7,8,51])
In [27]:
feature = 'num_cs_bins'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='group:\n# of cleavage sites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within group',xlabel='',title = 'within top expression quantile')
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within group',xlabel='',title = 'within lowest expression quantile')
Out[27]:
[Text(0, 0.5, '% within group'), Text(0.5, 0, ''), Text(0.5, 1.0, 'within lowest expression quantile')]
In [35]:
feature = 'normal_gini_bin'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='norm. gini index',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within group',xlabel='',title = 'within top expression quantile')
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within group',xlabel='',title = 'within lowest expression quantile')
Out[35]:
[Text(0, 0.5, '% within group'), Text(0.5, 0, ''), Text(0.5, 1.0, 'within lowest expression quantile')]
In [ ]:
In [ ]:
In [ ]:
In [36]:
feature = 'phastcon_quant'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,4))
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q15'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons quantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(ylabel = '% within\nphast cons\nquantile',xlabel='',title = 'within top expression quantile')
merged_pas_motif_table['t']=1
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant'].isin(['q1'])].groupby(['class',feature]).agg({'t':sum}).reset_index()
gr = pd.merge(gr,gr.groupby(feature).agg({'t':sum}).reset_index().rename(columns={'t':'t_total'}),how='inner',on=feature)
gr['%'] = np.round(gr['t']/gr['t_total']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(ylabel = '% within\nphast cons\nquantile',xlabel='',title = 'within lowest expression quantile')
Out[36]:
[Text(0, 0.5, '% within\nphast cons\nquantile'), Text(0.5, 0, ''), Text(0.5, 1.0, 'within lowest expression quantile')]
In [51]:
merged_pas_motif_table['t']=1
d = {}
for motif in cols[2:7]:
d[motif] = np.sum
d['t'] = np.sum
gr = merged_pas_motif_table.groupby(['class','RPM_quant']).agg(d).reset_index()
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(len(cols[2:7]),1, sharey=False, sharex=True,figsize=(8,1.5*len(cols[2:7])))
i=0
for motif in (cols[2:7]):
gr[motif+'_%'] = np.round(gr[motif]/gr['t']*100,2)
ax = sns.barplot(ax = axes[i],data = gr,x='class',y=motif+'_%',hue='RPM_quant',order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
if i==0:
ax.set(title = '% of PAS with motif, in (class x expr. quantile) category')
if i!=len(cols[2:7])-1:
ax.legend_.remove()
ax.set(xlabel='',xticks=[])
else:
ax.legend(bbox_to_anchor=(1.05, 3),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=1,fontsize=9,mode=None)
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
i=i+1
# fig.tight_layout(pad=0.5)
In [54]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)
y_feature = 'any_canonic_motif_pres'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(13,2.5))
feature = 'RPM_quant'
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity',xlabel='')
Out[54]:
[Text(0.5, 1.0, '% of PAS with any canonical motif in (-35,-10) vicinity'), Text(0.5, 0, '')]
In [58]:
142884.15536499827
148238.42076623856
6937.294029423888
15320.140209926649
8243.35382939582
4785.684895123209
24165.790583432936
In [ ]:
In [81]:
gr['motif_not_present'] = gr['t']-gr['any_canonic_motif_pres']
# remove the scale influence
gr['motif_pres'] = gr['any_canonic_motif_pres']/gr['t']*100
gr['motif_not_pres'] = gr['motif_not_present']/gr['t']*100
a = []
for class_name in ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic']:
a.append([class_name,stats.chi2_contingency(observed=gr.loc[gr['class']==class_name][['motif_not_pres','motif_pres']].values,)[0]])
a = pd.DataFrame(a,columns = ['class','chi2_stat'])
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1,1, sharey=True, sharex=True,figsize=(5,2.5))
ax = sns.pointplot(data = a,x='class',y='chi2_stat',order = ['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
Out[81]:
[Text(0, 0, 'TE'), Text(1, 0, 'intronic'), Text(2, 0, 'exonic'), Text(3, 0, 'true_intergenic'), Text(4, 0, 'antisense_TE'), Text(5, 0, 'antisense_exonic'), Text(6, 0, 'antisense_intronic')]
In [72]:
Out[72]:
| 0 | 1 | |
|---|---|---|
| 0 | TE | 142884.155365 |
| 1 | intronic | 148238.420766 |
| 2 | exonic | 6937.294029 |
| 3 | true_intergenic | 15320.140210 |
| 4 | antisense_TE | 8243.353829 |
| 5 | antisense_exonic | 4785.684895 |
| 6 | antisense_intronic | 24165.790583 |
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [48]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)
y_feature = 'any_canonic_motif_pres'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))
feature = 'RPM_quant'
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='expression\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity',xlabel='')
feature = 'phastcon_quant'
gr = merged_pas_motif_table.groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '')
Out[48]:
[Text(0.5, 1.0, '')]
In [ ]:
In [ ]:
In [ ]:
In [103]:
merged_pas_motif_table.head()
Out[103]:
| score | class | AAUAAA | AUUAAA | UAUAAA | AGUAAA | AAUACA | AAUAUA | CAUAAA | GAUAAA | ACUAAA | AAUAGA | phastcon | RPM_log2 | RPM_quant | any_canonic_motif | phastcon_quant | t | any_canonic_motif_pres | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.307465 | true_intergenic | False | False | False | False | False | False | False | False | False | False | 0.000000 | -1.701508 | q3 | 0 | q1 | 1 | False |
| 1 | 0.307465 | true_intergenic | False | False | False | False | False | False | False | False | False | False | 0.000000 | -1.701508 | q3 | 0 | q1 | 1 | False |
| 2 | 6.342241 | true_intergenic | True | False | False | False | False | False | False | False | False | False | 0.000000 | 2.664993 | q13 | 1 | q1 | 1 | True |
| 3 | 0.023180 | true_intergenic | False | True | False | False | False | False | False | False | False | False | 0.001025 | -5.430964 | q1 | 1 | q2 | 1 | True |
| 4 | 1.301714 | true_intergenic | False | False | False | False | False | False | False | False | False | False | 0.009350 | 0.380412 | q9 | 0 | q7 | 1 | False |
In [173]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)
y_feature = 'any_canonic_motif_pres'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))
feature = 'phastcon_quant'
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q15'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='phast cons\nquantile',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')
feature = 'phastcon_quant'
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q1'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[173]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
In [168]:
merged_pas_motif_table.head()
Out[168]:
| score | class | AAUAAA | AUUAAA | UAUAAA | AGUAAA | AAUACA | AAUAUA | CAUAAA | GAUAAA | ... | any_canonic_motif | phastcon_quant | max_gini | normal_gini | RPM_log2 | RPM_quant | t | normal_gini_bin | num_cs_bins | any_canonic_motif_pres | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.338847 | true_intergenic | False | False | False | False | False | False | False | False | ... | 0 | q14 | 0.000010 | 0.000000 | -1.561296 | q4 | 1 | (-0.001, 0.1] | (0, 1] | False |
| 1 | 0.458394 | true_intergenic | False | False | False | False | False | False | False | False | ... | 1 | q5 | 0.666677 | 0.987850 | -1.125339 | q5 | 1 | (0.9, 1.0] | (2, 3] | True |
| 2 | 0.222169 | true_intergenic | True | False | False | False | False | False | False | False | ... | 1 | q3 | 0.800010 | 0.771199 | -2.170270 | q3 | 1 | (0.7, 0.8] | (4, 5] | True |
| 3 | 0.023180 | true_intergenic | True | False | False | False | False | False | False | False | ... | 1 | q1 | 0.000010 | 0.000000 | -5.430964 | q1 | 1 | (-0.001, 0.1] | (0, 1] | True |
| 4 | 0.867214 | true_intergenic | False | True | False | False | False | False | False | False | ... | 1 | q5 | 0.500010 | 0.104058 | -0.205539 | q8 | 1 | (0.1, 0.2] | (1, 2] | True |
5 rows × 26 columns
In [175]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)
y_feature = 'any_canonic_motif_pres'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))
feature = 'num_cs_bins'
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q15'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='# of cleavage sites',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')
feature = 'num_cs_bins'
gr = merged_pas_motif_table.loc[merged_pas_motif_table['RPM_quant']=='q1'].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[175]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
In [185]:
merged_pas_motif_table['t']=1
merged_pas_motif_table['any_canonic_motif_pres'] = (merged_pas_motif_table['any_canonic_motif']>0)
y_feature = 'any_canonic_motif_pres'
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(2,1, sharey=True, sharex=True,figsize=(13,5))
feature = 'normal_gini_bin'
gr = merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q15')&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[0],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend(bbox_to_anchor=(1.05,1),loc=2,borderaxespad=0,title='gini index\nPAS with >1 cleavage site',markerscale=1.5,ncols=2,fontsize=9,mode=None)
ax.set(title = '% of PAS with any canonical motif in (-35,-10) vicinity\n\nwithin TOP expression quantile',xlabel='')
feature = 'normal_gini_bin'
gr = merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q1')&(merged_pas_motif_table['num_cs']!=1)].groupby(['class',feature]).agg({'t':sum,y_feature:sum}).reset_index()
gr['%'] = np.round(gr[y_feature]/gr['t']*100,2)
ax = sns.barplot(ax=axes[1],data = gr,x='class',y='%',hue=feature,order=['TE','intronic','exonic','true_intergenic','antisense_TE','antisense_exonic','antisense_intronic'])
ax.legend_.remove()
ax.set(title = 'within lowest expression quantile')
Out[185]:
[Text(0.5, 1.0, 'within lowest expression quantile')]
In [ ]:
In [ ]:
In [13]:
list_of_indices = list(merged_pas_motif_table.loc[(merged_pas_motif_table['RPM_quant']=='q15')&(
merged_pas_motif_table['class'].astype('str')=='intronic')].index)
In [14]:
len(list_of_indices)
Out[14]:
697474
In [16]:
data_full = pd.read_csv('/scicore/home/zavolan/moon0000/intergenic_analysis/result-240228/rcs_motif_check/merged_rcs_motif_phastcon_entropy.bed',delimiter="\t",
index_col=None,header=0,usecols = [3,4])
In [17]:
IPA_bed = data_full.loc[list_of_indices].reset_index(drop=True)
In [20]:
IPA_bed['chr'] = IPA_bed['id'].str.split(':',expand=True)[0]
IPA_bed['start'] = IPA_bed['id'].str.split(':',expand=True)[3]
IPA_bed['end'] = IPA_bed['id'].str.split(':',expand=True)[4]
IPA_bed['strand'] = IPA_bed['id'].str.split(':',expand=True)[2]
In [22]:
IPA_bed[['chr','start','end','id','score','strand']].to_csv('/scicore/home/zavolan/GROUP/IPA/IPA_catalogue/SCINPAS_all_normal_q15Expr.bed', sep=str('\t'),header=False,index=None)
In [ ]:
In [ ]:
In [ ]:
In [19]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [63]:
gr.loc[gr['organ']=='eye'].sort_values('TE_%')
Out[63]:
| organ | project | organ_x_project | TE_% | TE_%_organ | color_x | color_y | |
|---|---|---|---|---|---|---|---|
| 17 | eye | WongAdultRetina | eye WongAdultRetina | 54.300 | 79.7425 | (0.1998437102815942, 0.6927711055021983, 0.448... | (0.1998437102815942, 0.6927711055021983, 0.448... |
| 16 | eye | HumanFoveaRetinaScheetzSheffield | eye HumanFoveaRetinaScheetzSheffield | 72.425 | 79.7425 | (0.1998437102815942, 0.6927711055021983, 0.448... | (0.1998437102815942, 0.6927711055021983, 0.448... |
| 15 | eye | HumanCorneaStemCells | eye HumanCorneaStemCells | 87.060 | 79.7425 | (0.1998437102815942, 0.6927711055021983, 0.448... | (0.1998437102815942, 0.6927711055021983, 0.448... |
| 14 | eye | HumanCorneaDevelopment | eye HumanCorneaDevelopment | 89.260 | 79.7425 | (0.1998437102815942, 0.6927711055021983, 0.448... | (0.1998437102815942, 0.6927711055021983, 0.448... |
In [64]:
merged_num_PAS_each_class_rpm.loc[merged_num_PAS_each_class_rpm['project']=='WongAdultRetina']
Out[64]:
| sample | organ | intronic | exonic | TE | true_intergenic | antisense_intronic | antisense_exonic | antisense_TE | total_read | ... | total_log | antisense_TE_% | antisense_exonic_% | antisense_intronic_% | true_intergenic_% | exonic_% | intronic_% | TE_% | project | organ_x_project | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 412 | 10X_WongAdultRetina_RetinaWongscRNASample1 | eye | 3269.0 | 2056.0 | 8869.0 | 575.0 | 815.0 | 30.0 | 63.0 | 15677.0 | ... | 4.195291 | 0.40 | 0.19 | 5.20 | 3.67 | 13.11 | 20.85 | 56.57 | WongAdultRetina | eye WongAdultRetina |
| 413 | 10X_WongAdultRetina_RetinaWongscRNASample2 | eye | 3599.0 | 2440.0 | 10276.0 | 668.0 | 841.0 | 28.0 | 103.0 | 17955.0 | ... | 4.254210 | 0.57 | 0.16 | 4.68 | 3.72 | 13.59 | 20.04 | 57.23 | WongAdultRetina | eye WongAdultRetina |
| 414 | 10X_WongAdultRetina_RetinaWongscRNASample3 | eye | 1022.0 | 680.0 | 2485.0 | 229.0 | 316.0 | 17.0 | 27.0 | 4776.0 | ... | 3.679155 | 0.57 | 0.36 | 6.62 | 4.79 | 14.24 | 21.40 | 52.03 | WongAdultRetina | eye WongAdultRetina |
| 415 | 10X_WongAdultRetina_RetinaWongscRNASample4 | eye | 898.0 | 558.0 | 1989.0 | 204.0 | 266.0 | 10.0 | 30.0 | 3955.0 | ... | 3.597256 | 0.76 | 0.25 | 6.73 | 5.16 | 14.11 | 22.71 | 50.29 | WongAdultRetina | eye WongAdultRetina |
4 rows × 22 columns
In [ ]:
In [ ]:
In [108]:
lims = (0,6)
sns.set(font_scale=1)
sns.set_style("white")
fig, axes = plt.subplots(1, 1, sharey=False, sharex=False,figsize=(5,5))
merged_num_PAS_each_class['total_log'] = np.log10(merged_num_PAS_each_class['total']+1)
for category in ['antisense_TE','antisense_exonic','antisense_intronic','true_intergenic','exonic','intronic','TE']:
merged_num_PAS_each_class[category+'_log'] = np.log10(merged_num_PAS_each_class[category]+1)
merged_num_PAS_each_class[category+'_%'] = np.round(merged_num_PAS_each_class[category]/merged_num_PAS_each_class['total']*100,2)
ax = sns.regplot(data = merged_num_PAS_each_class,y=category+'_log',x='total_log',label = category,scatter_kws={'s':5})
ax.legend(bbox_to_anchor=(1.05, 1),loc=2,borderaxespad=0,title='PAS class',markerscale=1.5,ncols=1,fontsize=9,mode=None)
ax.set(ylabel = '# supported PAS in a class, $log_{10}$',xlabel = '# total supported PAS, $log_{10}$')
# ax.set(xlim=lims,ylim=lims)
Out[108]:
[Text(0, 0.5, '# supported PAS in a class, $log_{10}$'),
Text(0.5, 0, '# total supported PAS, $log_{10}$')]
Drafts¶
In [1]:
(26+28)/1995
Out[1]:
0.02706766917293233
In [178]:
NPV = 0.9999
FDR = 0.46
alpha = 0.06 # incidence in population
A = (1-NPV)/NPV
B = alpha/(1-alpha)
C = (1-FDR)/FDR
D = (1+(B-A)/(C-B))
X = NPV/(1-NPV)
Y = (1-alpha)/alpha
Z = FDR/(1-FDR)
specificity = X/(Y*(1+(Y-X)/(Z-Y)))
sensitivity = 1-A/(B*D)
np.round(sensitivity,3)*100,np.round(specificity,3)*100
Out[178]:
(99.9, 94.6)
In [69]:
98.5 - 99.2, 90.6 - 94.6
Out[69]:
-0.7000000000000028
In [ ]:
def add_chr_prefix(seqid):
# Check if seqid already starts with 'chr'
if not seqid.startswith('chr'):
return 'chr' + seqid
else:
return seqid
def change_dna_to_rna(sequence, direction):
"""
Parameters
----------
sequence : string
a current genome DNA sub-sequence with same length as motif
(always + strand because reference genome is always + strand).
we need to convert this DNA into RNA so that we can decide
whether this sub-sequence is identical to the motif or not.
direction : character
direction of DNA in which a read maps to.
Returns
-------
corrected_string : string
RNA version (5' -> 3') of the current genome DNA sub-sequence.
Now you can directly compare it with the motif.
i.e. change a subesequence of DNA into RNA so that it becomes compatible with motif (5' -> 3')
"""
corrected_sequence = []
# if a read is mapping to - strand,
# revert the DNA sequence and then make a complementary
if direction == '-':
reverted_subsequence = list(reversed(sequence))
for elem in reverted_subsequence:
if elem == 'A':
corrected_sequence.append('U')
elif elem == 'T':
corrected_sequence.append('A')
elif elem == 'G':
corrected_sequence.append('C')
elif elem == 'C':
corrected_sequence.append('G')
# if a read is mapping to + strand
# only change T in the DNA -> U in RNA. other nucleotides stay the same
elif direction == '+':
for elem in sequence:
if elem == 'A':
corrected_sequence.append('A')
elif elem == 'T':
corrected_sequence.append('U')
elif elem == 'G':
corrected_sequence.append('G')
elif elem == 'C':
corrected_sequence.append('C')
# convert a list of characters into a single string
corrected_string = ''.join(corrected_sequence)
return corrected_string
def get_extra_col(L,group, motives_df, fasta_path):
fasta_f = pysam.FastaFile(fasta_path)
name = group[0]
pas_df = group[1]
rcs_list = list(pas_df['id'])
for index, row in motives_df.iterrows():
motif = row['motif']
# upper and lower boundaries are negative values
# e.g. upper = -35, lower = -10
upper = int(row['upper'])
lower = int(row['lower'])
# print(f'Motif: {motif}, Upper: {upper}, Lower: {lower}')
values = []
for rcs_id in rcs_list:
chrom = add_chr_prefix(rcs_id.split(':')[0])
if chrom == 'not_needed':
continue
rcs = int(rcs_id.split(':')[1])
strand = rcs_id.split(':')[2]
if strand == '-':
# rcs + 10
sequence_start = rcs - lower
# rcs + 35
sequence_end = rcs - upper
elif strand == '+':
# rcs - 35
sequence_start = rcs + upper
# rcs - 10
sequence_end = rcs + lower
dna_subsequence = fasta_f.fetch(reference=chrom, start=sequence_start, end=sequence_end + 1)
# correct DNA subsequence into RNA so that it becomes compatible with motif
corrected_subsequence = change_dna_to_rna(dna_subsequence, strand)
if motif in corrected_subsequence:
# print('corrected_subsequence match : ' + str(corrected_subsequence))
# print('motif match: ' + str(motif))
value = 1
else:
# print('corrected_subsequence no match : ' + str(corrected_subsequence))
# print('motif no match: ' + str(motif))
value = 0
values.append(value)
# assert(len(values) == len(rcs_list))
# append a new column
pas_df[motif] = values
L.append(pas_df)
print(', '.join(group[0])+' done, '+str(time.time()-start_time))
def add_columns_serial(pas_df, motives_df, fasta_path):
final_dfs = []
groups = pas_df.groupby(['seqid', 'strand'])
with Manager() as manager:
L = manager.list()
processes = []
i=0
for group in groups:
p = Process(target=get_extra_col, args=(L,group, motives_df, fasta_path)) # Passing the list
p.start()
processes.append(p)
i=i+1
for p in processes:
p.join()
L = list(L)
final_df = pd.concat(L)
return final_df
start_time = time.time()
v2_intermediate = add_columns_serial(v2_subset, motives, fasta_dir)
v2_intermediate = v2_intermediate.sort_values(['seqid','start','end']).reset_index(drop=True)
v2_final = v2_intermediate.copy()
motif_cols = list(motives['motif'])
v2_final['all_motif'] = v2_final[motif_cols].max(axis=1)
In [ ]:
start_time = time.time()
deep_intermediate = add_columns_serial(deep, motives, fasta_dir)
deep_intermediate = deep_intermediate.sort_values(['seqid','start','end']).reset_index(drop=True)
deep_final = deep_intermediate.copy()
motif_cols = list(motives['motif'])
deep_final['all_motif'] = deep_final[motif_cols].max(axis=1)
In [ ]:
# check confounding of avg usage and tissue-specificity
In [413]:
data['avg_expression'] = np.log2(data[tissues].mean(1)) # mean of means
data['avg_usage'] = data[usage_cols].mean(1) # mean of means
data['qcut_avg_usage'] = pd.qcut(data['avg_usage'],q=10) # quantiles
In [420]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','segment_class']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['segment_class']=='I'].reset_index(drop=True) #
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')
groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)
from statsmodels.stats import proportion as smprop
x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2
# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
reorder_dict_x[x_val] = i
i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
reorder_dict_hue[hue_val] = i
i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))
# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)
ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])],
elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")
ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = '% of PAS in intronic\nclass')
ax.tick_params(left=True, bottom=True,width=0.5)
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_intronic_fraction.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_intronic_fraction.pdf',bbox_inches='tight',dpi=600)
In [421]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','segment_class']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['segment_class']=='TE'].reset_index(drop=True) #
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')
groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)
from statsmodels.stats import proportion as smprop
x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2
# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
reorder_dict_x[x_val] = i
i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
reorder_dict_hue[hue_val] = i
i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))
# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)
ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])],
elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")
ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = '% of PAS in\n"terminal exon"\nclass')
ax.tick_params(left=True, bottom=True,width=0.5)
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_TE_fraction.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_TE_fraction.pdf',bbox_inches='tight',dpi=600)
In [426]:
data['t']=1
gr = data.groupby(['qcut_avg_usage','ts','all_motif']).agg({'t':sum}).reset_index()
# gr['t'] = gr['t']+1 # pseudocount
gr = pd.merge(gr,gr.groupby(['qcut_avg_usage','ts']).agg({'t':sum}).reset_index().rename(columns={'t':'t_sum'}),how='inner',on=['qcut_avg_usage','ts'])
gr['%'] = np.round(gr['t']/gr['t_sum']*100,2)
gr['prop'] = gr['t']/gr['t_sum']
gr['%_ci_up'] = np.round(gr.apply(lambda x:binom.ppf(0.975, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr['%_ci_down'] = np.round(gr.apply(lambda x:binom.ppf(0.025, x['t_sum'], x['prop']),1)/gr['t_sum']*100,2)
gr = gr.loc[gr['all_motif']==1].reset_index(drop=True) #
gr['ts'] = gr['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')
groups_with_na = list(gr.loc[(gr['prop'].isna())|(gr['t_sum']<100)]['qcut_avg_usage'].unique())
gr = gr.loc[~gr['qcut_avg_usage'].isin(groups_with_na)].reset_index(drop=True)
from statsmodels.stats import proportion as smprop
x_feature, y_feature, hue_feature = 'qcut_avg_usage', '%','ts'
order = list(gr['qcut_avg_usage'].unique())
hue_order = ['tissue-specific PAS','other PAS']
palette = ['green','royalblue']
dodge = 0.2
# reorder dataframe
reorder_dict_x = {}
i=0
for x_val in order:
reorder_dict_x[x_val] = i
i=i+1
gr['x_order'] = gr[x_feature].map(reorder_dict_x)
reorder_dict_hue = {}
i=0
for hue_val in hue_order:
reorder_dict_hue[hue_val] = i
i=i+1
gr['hue_order'] = gr[hue_feature].map(reorder_dict_hue)
gr_reordered = gr.loc[(~gr['x_order'].isna())&(~gr['hue_order'].isna())].sort_values(['x_order','hue_order']).reset_index(drop=True)
gr_reordered['x_order_adj'] = gr_reordered['x_order']-dodge*((gr_reordered['hue_order']==0).astype('int')*2-1)
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1.1))
# ax = sns.pointplot(data = gr,x=x_feature,y=y_feature,hue=hue,order = order,hue_order = hue_order,palette=palette,dodge=dodge)
ax = sns.barplot(data = gr,x=x_feature,y=y_feature,hue=hue_feature,order = order,hue_order = hue_order,palette=palette,dodge=True)
ax.errorbar(x=list(gr_reordered['x_order_adj']), y=list(gr_reordered[y_feature]), yerr=[list(gr_reordered[y_feature]-gr_reordered['%_ci_down']),list(gr_reordered['%_ci_up']-gr_reordered[y_feature])],
elinewidth = 0.5,capsize=0.7, capthick=0.2,fmt="none", color="black")
ax.legend_.remove()
ax.set_xticklabels(labels = ax.get_xticklabels(), rotation=60, ha='right',va='top',rotation_mode='anchor')
ax.set(xlabel='',ylabel = 'Motif presence, %')
ax.tick_params(left=True, bottom=True,width=0.5)
out = subprocess.check_output('mkdir -p '+subdirs['figures_dir']+'over_tissues/', shell=True)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_motif_presence.png',bbox_inches='tight',dpi=600)
fig.savefig(subdirs['figures_dir']+'over_tissues/std_vs_average_motif_presence.pdf',bbox_inches='tight',dpi=600)
In [217]:
sns.set(font_scale=0.5)
sns.set_style("white")
fig, axes = plt.subplots(1,1,sharey=True,sharex=True, figsize=(2.8, 1))
data_to_show = data.copy()
data_to_show['ts'] = data_to_show['ts'].str.replace('False','other PAS').replace('True','tissue-specific PAS')
ax = sns.boxplot(data = data_to_show,x='ts',y='avg_usage',showfliers=False)